crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from typing import List
|
|
4
|
+
from helm.benchmark.scenarios.scenario import (
|
|
5
|
+
Scenario,
|
|
6
|
+
Instance,
|
|
7
|
+
Reference,
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
Input,
|
|
11
|
+
Output,
|
|
12
|
+
)
|
|
13
|
+
from helm.common.general import ensure_directory_exists
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MTSamplesProceduresScenario(Scenario):
    """
    Processes the MTSamples Procedure dataset, a subset of MTSamples,
    specifically focusing on procedure-related medical notes.
    This dataset contains transcribed medical reports detailing various procedures,
    treatments, and surgical interventions.

    - Extracts `PLAN`, `SUMMARY`, or `FINDINGS` sections as references.
    - Ensures these sections are excluded from the input text.
    - Filters out files that do not contain any of the three reference sections.

    Data source: https://github.com/raulista1997/benchmarkdata/tree/main/mtsample_procedure
    """

    GIT_HASH = "c4c252443fa9c52afb6960f53e51be278639bea2"
    GITHUB_DIR_URL = f"https://github.com/raulista1997/benchmarkdata/tree/{GIT_HASH}/mtsample_procedure"
    RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsample_procedure/"

    name = "mtsamples"
    description = (
        "A dataset that provides a patient note regarding an operation, with the objective to document the procedure."
    )
    tags = ["medical", "transcription", "plan_generation"]

    def fetch_file_list(self) -> List[str]:
        """
        Uses the GitHub API to fetch the list of `.txt` files in the dataset directory.

        Returns:
            File names (not URLs) of every `.txt` file in the directory.

        Raises:
            Exception: If the GitHub API request does not return HTTP 200.
        """
        # Fix: pin the listing to GIT_HASH (`?ref=`) so it always matches the
        # revision that RAW_BASE_URL downloads from; previously the listing came
        # from the default branch and could drift from the pinned raw files.
        api_url = (
            "https://api.github.com/repos/raulista1997/benchmarkdata/contents/mtsample_procedure"
            f"?ref={self.GIT_HASH}"
        )
        headers = {"Accept": "application/vnd.github+json"}

        response = requests.get(api_url, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch file list from GitHub API ({api_url})")

        files = response.json()
        return [file["name"] for file in files if file["name"].endswith(".txt")]

    def download_file(self, file_name: str, output_dir: str) -> str:
        """
        Downloads a text file from GitHub and saves it locally.

        Returns:
            Local path of the (possibly already cached) file.

        Raises:
            Exception: If the download does not return HTTP 200.
        """
        file_url = self.RAW_BASE_URL + file_name
        file_path = os.path.join(output_dir, file_name)

        if not os.path.exists(file_path):  # Avoid redundant downloads
            response = requests.get(file_url)
            if response.status_code != 200:
                raise Exception(f"Failed to download {file_url}")

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(response.text)

        return file_path

    @staticmethod
    def _extract_section(text: str, header: str):
        """
        Returns the remainder of the line following the first occurrence of
        `header` (e.g. "PLAN:"), or None if the header is absent.
        Detection is case-insensitive; `header` must be uppercase ASCII so
        that indices in `text.upper()` line up with indices in `text`.
        """
        index = text.upper().find(header)
        if index == -1:
            return None
        remainder = text[index + len(header):]
        return remainder.split("\n", 1)[0].strip()

    def extract_sections(self, text: str) -> tuple:
        """
        Extracts `PLAN`, `SUMMARY`, and `FINDINGS` sections from the text.
        Returns (plan, summary, findings) as a tuple, ensuring uppercase detection.

        Bug fix: the previous implementation checked for the header in the
        upper-cased text but then split the ORIGINAL text on the uppercase
        header, raising IndexError whenever a header appeared in mixed or
        lower case (e.g. "Plan:").
        """
        plan = self._extract_section(text, "PLAN:")
        summary = self._extract_section(text, "SUMMARY:")
        findings = self._extract_section(text, "FINDINGS:")
        return plan, summary, findings

    def remove_sections(self, text: str) -> str:
        """
        Removes `PLAN`, `SUMMARY`, and `FINDINGS` sections from the input text.

        NOTE(review): truncation is case-sensitive (historical behavior kept),
        while extraction above is case-insensitive; a lower-case header would
        be extracted as a reference but not removed from the input — confirm
        whether this asymmetry is intended.
        """
        sections = ["PLAN:", "SUMMARY:", "FINDINGS:"]
        for section in sections:
            if section in text:
                text = text.split(section)[0].strip()  # Keep content before the section
        return text

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Downloads, processes, and converts MTSamples data into HELM format.
        Per-file failures are logged and skipped rather than aborting the run.
        """
        ensure_directory_exists(output_path)

        # Fetch list of available files from GitHub
        file_list = self.fetch_file_list()

        instances = []
        for file_name in file_list:
            try:
                # Download the text file
                file_path = self.download_file(file_name, output_path)

                # Read content
                with open(file_path, "r", encoding="utf-8") as f:
                    text_content = f.read().strip()

                # Extract structured sections
                plan, summary, findings = self.extract_sections(text_content)

                # Use plan > summary > findings as reference text
                reference_text = plan or summary or findings
                if not reference_text:
                    continue  # Ignore notes with no reference section

                # Remove structured sections from input
                cleaned_text = self.remove_sections(text_content)

                # Create HELM instance
                instances.append(
                    Instance(
                        input=Input(text=cleaned_text),  # Processed text without sections
                        references=[Reference(Output(text=reference_text), tags=[CORRECT_TAG])],
                        split=TEST_SPLIT,
                    )
                )
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

        return instances
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from typing import List
|
|
4
|
+
from helm.benchmark.scenarios.scenario import (
|
|
5
|
+
Scenario,
|
|
6
|
+
Instance,
|
|
7
|
+
Reference,
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
Input,
|
|
11
|
+
Output,
|
|
12
|
+
)
|
|
13
|
+
from helm.common.general import ensure_directory_exists
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MTSamplesReplicateScenario(Scenario):
    """
    MTSamples.com is designed to give you access to a big collection of transcribed medical reports.
    These samples can be used by learning, as well as working medical transcriptionists for their daily
    transcription needs. We present the model with patient information and request it to generate a corresponding
    treatment plan.

    Sample Synthetic Prompt:
    Given various information about a patient, return a reasonable treatment plan for the patient.

    - Extracts `PLAN`, `SUMMARY`, or `FINDINGS` as the reference (PLAN preferred).
    - Removes `PLAN` from the input text but keeps other sections.
    - Ignores files that do not contain any of these reference sections.
    """

    GIT_HASH = "ebc104a4f96c5b7602242f301e081e9934a23344"
    API_BASE_URL = (
        f"https://api.github.com/repos/raulista1997/benchmarkdata/contents/mtsamples_processed?ref={GIT_HASH}"
    )
    RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsamples_processed/"

    name = "mtsamples_replicate"
    description = (
        "A dataset of clinical notes where the model is prompted to generate "
        "a reasonable treatment plan for the patient based on transcribed medical reports."
    )
    tags = ["medical", "transcription", "plan_generation"]

    def fetch_file_list(self) -> List[str]:
        """
        Uses the GitHub API to fetch the list of `.txt` files at a specific commit.

        Raises:
            Exception: If the GitHub API request does not return HTTP 200.
        """
        response = requests.get(self.API_BASE_URL)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch file list from GitHub API: {response.text}")

        files = response.json()
        return [f["name"] for f in files if f["name"].endswith(".txt")]

    def download_file(self, file_name: str, output_dir: str) -> str:
        """
        Downloads a text file from GitHub and saves it locally.

        Returns:
            Local path of the (possibly already cached) file.

        Raises:
            Exception: If the download does not return HTTP 200.
        """
        file_url = self.RAW_BASE_URL + file_name
        file_path = os.path.join(output_dir, file_name)

        if not os.path.exists(file_path):
            response = requests.get(file_url)
            if response.status_code != 200:
                raise Exception(f"Failed to download {file_url}")
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(response.text)

        return file_path

    @staticmethod
    def _extract_section(text: str, header: str):
        """
        Returns the remainder of the line following the first occurrence of
        `header` (e.g. "PLAN:"), or None if the header is absent.
        Detection is case-insensitive; `header` must be uppercase ASCII so
        that indices in `text.upper()` line up with indices in `text`.
        """
        index = text.upper().find(header)
        if index == -1:
            return None
        remainder = text[index + len(header):]
        return remainder.split("\n", 1)[0].strip()

    def extract_sections(self, text: str) -> tuple:
        """
        Extracts `PLAN`, `SUMMARY`, and `FINDINGS` sections from the text.
        Returns (plan, summary, findings) as a tuple, ensuring uppercase detection.

        Bug fix: the previous implementation checked for the header in the
        upper-cased text but then split the ORIGINAL text on the uppercase
        header, raising IndexError whenever a header appeared in mixed or
        lower case (e.g. "Plan:").
        """
        plan = self._extract_section(text, "PLAN:")
        summary = self._extract_section(text, "SUMMARY:")
        findings = self._extract_section(text, "FINDINGS:")
        return plan, summary, findings

    def remove_plan_section(self, text: str) -> str:
        """
        Removes `PLAN:` section from the input text while keeping the rest.

        NOTE(review): truncation is case-sensitive (historical behavior kept),
        while extraction above is case-insensitive — confirm whether a
        lower-case "Plan:" should also be removed.
        """
        sections = ["PLAN:"]
        for section in sections:
            if section in text:
                text = text.split(section)[0].strip()  # Keep content before PLAN
        return text

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Downloads, processes, and converts MTSamples data into HELM format.
        Per-file failures are logged and skipped rather than aborting the run.
        """
        ensure_directory_exists(output_path)

        # Fetch list of available files from GitHub
        file_list = self.fetch_file_list()

        instances = []
        for file_name in file_list:
            try:
                # Download the text file
                file_path = self.download_file(file_name, output_path)

                # Read content
                with open(file_path, "r", encoding="utf-8") as f:
                    text_content = f.read().strip()

                # Extract structured sections
                plan, summary, findings = self.extract_sections(text_content)

                # Use plan > summary > findings as reference text
                reference_text = plan or summary or findings
                if not reference_text:
                    continue  # Ignore notes with no reference section

                # Remove PLAN section from input
                cleaned_text = self.remove_plan_section(text_content)

                # Create HELM instance
                instances.append(
                    Instance(
                        input=Input(text=cleaned_text),  # Processed text without PLAN
                        references=[Reference(Output(text=reference_text), tags=[CORRECT_TAG])],
                        split=TEST_SPLIT,
                    )
                )
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

        return instances
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from typing import Any, Dict, List, Tuple, Optional
|
|
5
|
+
import xml.etree.ElementTree as ET
|
|
6
|
+
|
|
7
|
+
from helm.common.general import ensure_directory_exists
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
TRAIN_SPLIT,
|
|
11
|
+
Input,
|
|
12
|
+
Scenario,
|
|
13
|
+
Instance,
|
|
14
|
+
CORRECT_TAG,
|
|
15
|
+
Reference,
|
|
16
|
+
Output,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
# Definitions of the 13 cohort-selection inclusion criteria, keyed by the tag
# names used in the n2c2 XML annotations (see N2C2CTMatchingScenario below).
ORIGINAL_DEFINITIONS = {
    "ABDOMINAL": "History of intra-abdominal surgery, small or large intestine resection, or small bowel obstruction",
    "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define “advanced” \
as having 2 or more of the following: • Taking 2 or more medications to treat CAD • \
History of myocardial infarction (MI) • Currently experiencing angina • Ischemia, past or present",
    "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
    "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)",
    "CREATININE": "Serum creatinine level above the upper normal limit",
    "DIETSUPP-2MOS": "Taken a dietary supplement (excluding vitamin D) in the past 2 months",
    "DRUG-ABUSE": "Current or past history of drug abuse",
    "ENGLISH": "Patient must speak English",
    "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
    "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
    "MAJOR-DIABETES": "Major diabetes-related complication. For the purposes of this annotation, we define \
“major complication” (as opposed to “minor complication”) as any of the following that are a result of \
(or strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • \
Retinopathy • nephropathy • neuropathy",
    "MAKES-DECISIONS": "Patient must make their own medical decisions",
    "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
}
# Custom definitions for better prompts
# Same keys as ORIGINAL_DEFINITIONS, but with extra instructions for the LLM
# (leniency/strictness guidance, default assumptions). These are interpolated
# verbatim into the prompt by N2C2CTMatchingScenario.create_prompt.
LONG_DEFINITIONS = {
    "ABDOMINAL": "History of intra-abdominal surgery. This could include any form of intra-abdominal surgery, \
including but not limited to small/large intestine resection or small bowel obstruction",
    "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define \
“advanced” as having 2 or more of the following: (a) Taking 2 or more medications to treat CAD (b) History \
of myocardial infarction (MI) (c) Currently experiencing angina (d) Ischemia, past or present. \
The patient must have at least 2 of these categories (a,b,c,d) to meet this criterion, otherwise the patient \
does not meet this criterion. For ADVANCED-CAD, be strict in your evaluation of the patient -- if they just \
have cardiovascular disease, then they do not meet this criterion.",
    "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
    "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)..",
    "CREATININE": "Serum creatinine level above the upper normal limit",
    "DIETSUPP-2MOS": "Consumption of a dietary supplement (excluding vitamin D) in the past 2 months. To assess \
this criterion, go through the list of medications_and_supplements taken from the note. If a substance could \
potentially be used as a dietary supplement (i.e. it is commonly used as a dietary supplement, even if it \
is not explicitly stated as being used as a dietary supplement), then the patient meets this criterion. \
Be lenient and broad in what is considered a dietary supplement. For example, a 'multivitamin' and \
'calcium carbonate' should always be considered a dietary supplement if they are included in this list.",
    "DRUG-ABUSE": "Current or past history of drug abuse",
    "ENGLISH": "Patient speaks English. Assume that the patient speaks English, unless otherwise explicitly noted. \
If the patient's language is not mentioned in the note, then assume they speak English and thus meet \
this criteria.",
    "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
    "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
    "MAJOR-DIABETES": "Major diabetes-related complication. Examples of “major complication” (as opposed to \
“minor complication”) include, but are not limited to, any of the following that are a result of (or \
strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • Retinopathy \
• nephropathy • neuropathy. Additionally, if multiple conditions together imply a severe case of diabetes, \
then count that as a major complication.",
    "MAKES-DECISIONS": "Patient must make their own medical decisions. Assume that the patient makes their own \
medical decisions, unless otherwise explicitly noted. There is no information provided about the \
patient's ability to make their own medical decisions, then assume they do make their own decisions and \
therefore meet this criteria.\"",
    # NOTE(review): the trailing \" above embeds a literal double-quote in the
    # MAKES-DECISIONS prompt text — looks like a typo; confirm before changing
    # (the string is part of runtime prompt output).
    "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class XMLDataLoader:
    """Loads the n2c2 cohort-selection XML corpus: one patient per `.xml` file,
    each containing a TEXT element (concatenated clinical notes) and a TAGS
    element (one subtag per inclusion criterion with a `met` attribute)."""

    def __init__(
        self, path_to_folder: str, is_convert_to_numbers=True, is_split_text=True, is_remove_excessive_new_lines=True
    ):
        # Directory containing the patient `.xml` files.
        self.path_to_folder = path_to_folder
        # If True, convert tag values to 1 ("met") / 0 (anything else).
        self.is_convert_to_numbers = is_convert_to_numbers
        # If True, split the TEXT element into individual notes.
        self.is_split_text = is_split_text
        # If True, collapse triple newlines in the note text.
        self.is_remove_excessive_new_lines = is_remove_excessive_new_lines

    def load_data(self) -> List[Dict[str, Any]]:
        """Main function: Data loader for the XML files

        Returns one dict per patient: {"patient_id": str, "ehr": List[str],
        "labels": Dict[tag, met-value]}.
        """
        data = []
        file_names = os.listdir(self.path_to_folder)
        # Sort for a deterministic patient ordering across runs/filesystems.
        file_names = sorted([file for file in file_names if file.endswith(".xml")])
        for file_name in file_names:
            file_path = os.path.join(self.path_to_folder, file_name)
            text, labels = self.parse_xml(file_path)
            data.append({"patient_id": file_name.replace(".xml", ""), "ehr": text, "labels": labels})

        return data

    @staticmethod
    def get_date_of_note(patient: Dict[str, Any], note_idx: int) -> Optional[str]:
        """Get date of note for patient

        Returns the first "Record date: YYYY-MM-DD" match in the note, or
        None (after logging an error) if no date header is present.
        """
        assert note_idx <= len(patient["ehr"]), f"{note_idx} out of bounds for {patient['patient_id']}"
        note: str = patient["ehr"][note_idx]
        match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
        date = match.group(1) if match else None
        if not date:
            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
        return date

    @staticmethod
    def get_current_date_for_patient(patient: Dict[str, Any]) -> Optional[str]:
        """Get most recent date visible in files for a given patient

        NOTE(review): keeps the date of the LAST note that has one, which is
        the most recent only if notes are stored chronologically — confirm.
        """
        most_recent_date = None
        for note in patient["ehr"]:
            match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
            most_recent_date = match.group(1) if match else most_recent_date
        if not most_recent_date:
            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
        return most_recent_date

    def parse_xml(self, XML_file) -> Tuple[List[str], Dict[str, Any]]:
        """Parse one patient XML file into (list of notes, criterion tags).

        The tag-dict values are ints when is_convert_to_numbers is set,
        raw strings otherwise (hence Dict[str, Any]).
        """
        tree = ET.parse(XML_file)
        root = tree.getroot()
        text_content = ""
        result_text: List[str] = []
        tags = {}
        for elem in root.iter():
            if elem.tag == "TEXT":
                # elem.text is None for an empty element; normalize to "".
                text_content = elem.text if elem.text else ""
                if self.is_remove_excessive_new_lines:
                    text_content = self.remove_excessive_newlines(text_content)
                if self.is_split_text:
                    result_text = self.split_text(text_content)
                else:
                    result_text = [text_content]
            elif elem.tag == "TAGS":
                tags = self.read_tags(root)
        return (result_text, tags)

    def read_tags(self, root) -> Dict[str, Any]:
        """Reads the tags from an XML file and returns a dictionary of tags

        Values are 1/0 when is_convert_to_numbers is set, else the raw `met`
        attribute string (may be None if the attribute is missing).
        """
        tags_dict = {}
        for tag in root.iter("TAGS"):
            for subtag in tag:
                met_value = subtag.attrib.get("met")
                if self.is_convert_to_numbers:
                    # "met" -> 1; anything else (incl. "not met"/missing) -> 0
                    met_value = 1 if met_value == "met" else 0
                tags_dict[subtag.tag] = met_value
        return tags_dict

    def split_text(self, text: str) -> List[str]:
        # Individual notes are delimited by a line of 100 asterisks;
        # drop empty fragments and strip surrounding whitespace.
        split_char = "*" * 100
        parts = [x.strip() for x in text.split(split_char) if x.strip() != ""]
        return parts

    def remove_excessive_newlines(self, text: str) -> str:
        # Single left-to-right pass: each "\n\n\n" becomes "\n", so longer
        # runs of newlines are only partially collapsed.
        text = text.replace("\n\n\n", "\n")
        return text
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class N2C2CTMatchingScenario(Scenario):
    """
    From "Cohort selection for clinical trials: n2c2 2018 shared task track 1" (Stubbs et al. 2019).
    N2C2 is a collection of 288 patients (202 train / 86 test), each with 2-5 deidentified real-world clinical notes.
    We use the prompt LLM formulation from Wornow et al. (2024).

    Citation
    ```
    @article{stubbs2019cohort,
    title={Cohort selection for clinical trials: n2c2 2018 shared task track 1},
    author={Stubbs, Amber and Filannino, Michele and Soysal, Ergin and Henry, Samuel and Uzuner, {\"O}zlem},
    journal={Journal of the American Medical Informatics Association},
    volume={26},
    number={11},
    pages={1163--1171},
    year={2019},
    publisher={Oxford University Press}
    }
    @article{wornow2024zero,
    title={Zero-shot clinical trial patient matching with llms},
    author={Wornow, Michael and Lozano, Alejandro and Dash, Dev and Jindal, Jenelle and Mahaffey, \
Kenneth W and Shah, Nigam H},
    journal={NEJM AI},
    pages={AIcs2400360},
    year={2024},
    publisher={Massachusetts Medical Society}
    }
    ```
    """

    name = "n2c2_ct_matching"
    description = "A dataset that provides clinical notes and asks the model to classify whether the \
patient is a valid candidate for a provided clinical trial."
    tags = []  # TODO

    # Binary classification: does the patient meet the inclusion criterion?
    POSSIBLE_ANSWER_CHOICES: List[str] = [
        "yes",
        "no",
    ]

    def __init__(self, subject: str):
        super().__init__()
        self.subject: str = subject  # specific inclusion criterion to assess
        # NOTE(review): hard-coded cluster paths — this scenario only runs on
        # hosts where these directories exist (the data is access-controlled
        # and cannot be downloaded automatically).
        self.path_to_train_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/train/"
        self.path_to_test_dir: str = "/share/pi/nigam/data/medhelm/n2c2_ct_matching/test/"

    def create_prompt(self, patient: Dict[str, Any]) -> str:
        """
        Render the zero-shot prompt for one patient: all notes with their
        dates, the criterion definition from LONG_DEFINITIONS, and the
        yes/no question.
        """
        # Cast None values to empty strings during string formatting, but keep the original functions returning None
        notes_list = [
            f"## Note #{i+1}\nDate: {XMLDataLoader.get_date_of_note(patient, i) or ''}\n{note}"
            for i, note in enumerate(patient["ehr"])
        ]
        notes: str = ("\n" + "*" * 50 + "\n\n").join(notes_list)
        current_date = XMLDataLoader.get_current_date_for_patient(patient)
        prompt = f"""
# Task
Your job is to decide whether the given patient meets the inclusion criterion for a clinical trial.

# Inclusion Criterion
The inclusion criterion being assessed is: "{self.subject}".
The definition of the inclusion criterion is: "{LONG_DEFINITIONS[self.subject]}".

# Patient Clinical Notes
Below is a set of {len(patient['ehr'])} clinical notes describing the patient's current health status. \
Each note is separated by a header with the date that the note was written, as well as a long list of asterisks.

{'-' * 100}

{notes}

{'-' * 100}

# Current Date
Assume that the current date is: {current_date}

# Question
Does the patient meet the inclusion criterion "{self.subject}"?
"""
        return prompt

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Builds one yes/no instance per patient for the configured criterion.
        Only the test split is used (zero-shot evaluation).
        """
        instances: List[Instance] = []
        for split in ["train", "test"]:
            # Limit to the zero-shot setting: skip the train split entirely.
            # (Restructured from a nested `if split == "test":` wrapping the
            # whole body, which left the train branch dead code.)
            if split != "test":
                continue
            path_to_data = self.path_to_train_dir if split == "train" else self.path_to_test_dir
            ensure_directory_exists(path_to_data)

            # Load dataset
            dataloader = XMLDataLoader(path_to_data)
            dataset = dataloader.load_data()

            # Create instances
            for patient in dataset:
                # Label is 1/0 from XMLDataLoader.read_tags (is_convert_to_numbers=True).
                is_met: bool = patient["labels"][self.subject]
                correct_answer: str = "yes" if is_met else "no"

                # Build `References`. The possible answer choices are "yes" or "no"
                references: List[Reference] = [
                    Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
                    for answer in N2C2CTMatchingScenario.POSSIBLE_ANSWER_CHOICES
                ]

                instances.append(
                    Instance(
                        input=Input(text=self.create_prompt(patient)),
                        references=references,
                        split=TRAIN_SPLIT if split == "train" else TEST_SPLIT,
                    )
                )

        return instances
|
|
@@ -10,7 +10,7 @@ from helm.common.hierarchical_logger import htrack_block, hlog
|
|
|
10
10
|
from typing import List, Dict
|
|
11
11
|
|
|
12
12
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists, asdict_without_nones
|
|
13
|
-
from .scenario import (
|
|
13
|
+
from helm.benchmark.scenarios.scenario import (
|
|
14
14
|
Scenario,
|
|
15
15
|
Instance,
|
|
16
16
|
Reference,
|
|
@@ -15,7 +15,16 @@ from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
|
15
15
|
from helm.common.authentication import Authentication
|
|
16
16
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
17
17
|
from helm.proxy.services.server_service import ServerService
|
|
18
|
-
from .scenario import
|
|
18
|
+
from helm.benchmark.scenarios.scenario import (
|
|
19
|
+
Scenario,
|
|
20
|
+
Instance,
|
|
21
|
+
Reference,
|
|
22
|
+
TRAIN_SPLIT,
|
|
23
|
+
TEST_SPLIT,
|
|
24
|
+
CORRECT_TAG,
|
|
25
|
+
Input,
|
|
26
|
+
Output,
|
|
27
|
+
)
|
|
19
28
|
|
|
20
29
|
try:
|
|
21
30
|
import sympy
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from typing import List, Any
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from datasets import load_dataset
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
CORRECT_TAG,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OABExamsScenario(Scenario):
    """
    The OAB Exam is a mandatory test for anyone who wants to practice law in Brazil. The exam is composed for
    an objective test with 80 multiple-choice questions covering all areas of Law and a written phase focused
    on a specific legal area (e.g., Civil, Criminal, Labor Law), where candidates must draft a legal document
    and answer four essay questions.

    This dataset is composed by the exams that occured between 2010 and 2018.

    The dataset can be found in this link: https://huggingface.co/datasets/eduagarcia/oab_exams
    """

    name = "oab_exams"
    description = "OAB exams dataset"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """
        Loads the HuggingFace dataset (cached under `output_path`/data) and
        turns each non-nullified multiple-choice question into an Instance
        whose references mark the gold answer with CORRECT_TAG.
        """
        cache_dir = str(Path(output_path) / "data")
        dataset = load_dataset("eduagarcia/oab_exams", cache_dir=cache_dir)

        instances: List[Instance] = []
        for row in dataset["train"]:
            # Canceled (nullified) questions have no valid answer key — skip.
            if row["nullified"]:
                continue

            choices = row["choices"]
            # Map answer letters (labels) to their full option texts.
            label_to_text = dict(zip(choices["label"], choices["text"]))
            gold_text = label_to_text[row["answerKey"]]

            references = [
                Reference(Output(text=option), tags=[CORRECT_TAG] if option == gold_text else [])
                for option in choices["text"]
            ]
            instances.append(
                Instance(input=Input(text=row["question"]), split=TEST_SPLIT, references=references)
            )
        return instances
|