crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from helm.common.general import ensure_directory_exists
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def create_prompt_text(clinical_note):
|
|
18
|
+
# Create a prompt for the model to generate a care plan based on the clinical note
|
|
19
|
+
prompt = f"""
|
|
20
|
+
You are provided with a clinical note regarding a physician-patient interaction. Your task is to \
|
|
21
|
+
extract specific information based solely on the content provided. Do not hallucinate or infer details \
|
|
22
|
+
that are not explicitly stated in the text. Any information you include must be directly entailed by the text.
|
|
23
|
+
|
|
24
|
+
Instructions:
|
|
25
|
+
|
|
26
|
+
Extract the required information precisely as presented in the source text.
|
|
27
|
+
|
|
28
|
+
If the text does not contain specific information, clearly state "Not mentioned".
|
|
29
|
+
|
|
30
|
+
Maintain the patient's original wording whenever possible.
|
|
31
|
+
|
|
32
|
+
Response Format:
|
|
33
|
+
|
|
34
|
+
Chief Complaint
|
|
35
|
+
[ENTER CHIEF COMPLAINT]
|
|
36
|
+
|
|
37
|
+
History of Present Illness
|
|
38
|
+
|
|
39
|
+
Onset: When did it start? Did it begin suddenly or gradually? [ENTER ONSET INFORMATION]
|
|
40
|
+
|
|
41
|
+
Provoking/Palliating Factors: What makes the symptoms better or worse? [ENTER Provoking/Palliating Factors INFORMATION]
|
|
42
|
+
|
|
43
|
+
Quality: Describe the symptoms, e.g., sharp pain, dull pain, stabbing pain. [ENTER QUALITY INFORMATION]
|
|
44
|
+
|
|
45
|
+
Region/Radiation: Where are your symptoms located? Do they move? [ENTER REGION/RADIATION INFORMATION]
|
|
46
|
+
|
|
47
|
+
Severity: On a scale of 1 to 10, how severe are your symptoms? [ENTER SEVERITY INFORMATION]
|
|
48
|
+
|
|
49
|
+
Timing: When do you experience the symptoms? What times of day? [ENTER TIMING INFORMATION]
|
|
50
|
+
|
|
51
|
+
Related Symptoms: Are there any other symptoms related to the main complaint? [ENTER RELATED SYMPTOMS INFORMATION]
|
|
52
|
+
|
|
53
|
+
Ensure your responses are concise, accurate, and entirely supported by the provided text. \
|
|
54
|
+
Do not introduce external knowledge or assumptions.
|
|
55
|
+
|
|
56
|
+
Clinical Note:
|
|
57
|
+
{clinical_note}
|
|
58
|
+
"""
|
|
59
|
+
return prompt
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class CHWCarePlanScenario(Scenario):
|
|
63
|
+
"""
|
|
64
|
+
A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).
|
|
65
|
+
|
|
66
|
+
- Input: The clinical note (column "MO Note").
|
|
67
|
+
- Output: The clinical note (column "MO Note"). We will use this note as the reference for entailment.
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
name = "chw_care_plan"
|
|
71
|
+
description = "A dataset containing free form text of a clinical health worker care plan, with the \
|
|
72
|
+
associated goal being to restructure that text into a given format."
|
|
73
|
+
tags = ["question_answering", "biomedical"]
|
|
74
|
+
|
|
75
|
+
def __init__(self):
|
|
76
|
+
"""
|
|
77
|
+
:param data_file: Path to the mimiciv_icd10.feather file.
|
|
78
|
+
"""
|
|
79
|
+
super().__init__()
|
|
80
|
+
self.data_file = "/share/pi/nigam/datasets/CHW_Dataset.csv"
|
|
81
|
+
|
|
82
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
83
|
+
ensure_directory_exists(os.path.dirname(self.data_file))
|
|
84
|
+
|
|
85
|
+
df = pd.read_csv(self.data_file) # columns: ["text", "target", ...]
|
|
86
|
+
|
|
87
|
+
instances: List[Instance] = []
|
|
88
|
+
|
|
89
|
+
# Use the entire dataset as one split (TEST_SPLIT)
|
|
90
|
+
for idx, row in df.iterrows():
|
|
91
|
+
note_text: str = row["MO Note"]
|
|
92
|
+
prompt_text = create_prompt_text(note_text)
|
|
93
|
+
if pd.isna(note_text):
|
|
94
|
+
print(f"Skipping row {idx} due to NaN value in 'MO Note'")
|
|
95
|
+
continue
|
|
96
|
+
# print(f"Prompt text: {prompt_text}")
|
|
97
|
+
|
|
98
|
+
# Create one Instance per patient
|
|
99
|
+
instance = Instance(
|
|
100
|
+
input=Input(text=prompt_text),
|
|
101
|
+
references=[Reference(Output(text=note_text), tags=[CORRECT_TAG])],
|
|
102
|
+
split=TEST_SPLIT,
|
|
103
|
+
)
|
|
104
|
+
instances.append(instance)
|
|
105
|
+
return instances
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from helm.common.general import ensure_directory_exists
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class CLEARScenario(Scenario):
|
|
18
|
+
"""
|
|
19
|
+
CLEARScenario is a dataset of human-labeled medical texts that indicate whether a patient has a history
|
|
20
|
+
of various medical conditions like alcohol dependence, depression, PTSD etc. Each example includes:
|
|
21
|
+
|
|
22
|
+
- text: A medical note or patient report.
|
|
23
|
+
- result_human: The human-provided label where:
|
|
24
|
+
1 indicates the patient has a history of the condition,
|
|
25
|
+
0 indicates the patient does not have a history of the condition,
|
|
26
|
+
2 indicates uncertainty about the patient's history of the condition.
|
|
27
|
+
|
|
28
|
+
For this scenario, the human label is mapped to a multiple-choice option as follows:
|
|
29
|
+
1 -> A, 0 -> B, 2 -> C
|
|
30
|
+
|
|
31
|
+
The task is to classify the text using a multiple-choice format.
|
|
32
|
+
|
|
33
|
+
Sample Synthetic Prompt:
|
|
34
|
+
You are a helpful medical assistant. Determine whether the patient has a history of <medical condition>.
|
|
35
|
+
|
|
36
|
+
Text: [insert text here]
|
|
37
|
+
|
|
38
|
+
A. Has a history of alcohol dependence
|
|
39
|
+
B. Does not have a history of alcohol dependence
|
|
40
|
+
C. Uncertain
|
|
41
|
+
|
|
42
|
+
Answer:
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
# List of all available conditions
|
|
46
|
+
CONDITIONS = [
|
|
47
|
+
"alcohol_dependence",
|
|
48
|
+
"attention_deficit_hyperactivity_disorder",
|
|
49
|
+
"bipolar_disorder",
|
|
50
|
+
"chronic_pain",
|
|
51
|
+
"homelessness",
|
|
52
|
+
"liver_disease",
|
|
53
|
+
"major_depression",
|
|
54
|
+
"personality_disorder",
|
|
55
|
+
"post_traumatic_stress_disorder",
|
|
56
|
+
"substance_use_disorder",
|
|
57
|
+
"suicidal_behavior",
|
|
58
|
+
"tobacco_dependence",
|
|
59
|
+
"unemployment",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
# Map condition names to human-readable descriptions for prompts
|
|
63
|
+
CONDITION_PROMPTS = {
|
|
64
|
+
"alcohol_dependence": "alcohol dependence",
|
|
65
|
+
"attention_deficit_hyperactivity_disorder": "attention deficit hyperactivity disorder (ADHD)",
|
|
66
|
+
"bipolar_disorder": "bipolar disorder",
|
|
67
|
+
"chronic_pain": "chronic pain",
|
|
68
|
+
"homelessness": "homelessness",
|
|
69
|
+
"liver_disease": "liver disease",
|
|
70
|
+
"major_depression": "major depression",
|
|
71
|
+
"personality_disorder": "personality disorder",
|
|
72
|
+
"post_traumatic_stress_disorder": "post-traumatic stress disorder (PTSD)",
|
|
73
|
+
"substance_use_disorder": "substance use disorder",
|
|
74
|
+
"suicidal_behavior": "suicidal behavior",
|
|
75
|
+
"tobacco_dependence": "tobacco dependence",
|
|
76
|
+
"unemployment": "unemployment",
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
def __init__(self, condition: str):
|
|
80
|
+
"""Initialize the scenario with a specific medical condition"""
|
|
81
|
+
super().__init__()
|
|
82
|
+
|
|
83
|
+
if condition not in self.CONDITIONS:
|
|
84
|
+
raise ValueError(f"Condition '{condition}' not supported. Available conditions: {self.CONDITIONS}")
|
|
85
|
+
|
|
86
|
+
self.condition = condition
|
|
87
|
+
self.name = f"clear_{condition}"
|
|
88
|
+
self.description = f"A dataset for evaluating {self.CONDITION_PROMPTS[condition]} detection from patient notes with yes/no/maybe classifications." # noqa: E501
|
|
89
|
+
self.tags = ["classification", "biomedical", condition.replace("_", "-")]
|
|
90
|
+
|
|
91
|
+
def get_answer_choices(self) -> List[str]:
|
|
92
|
+
"""Get the possible answer choices with the condition filled in."""
|
|
93
|
+
condition_text = self.CONDITION_PROMPTS[self.condition]
|
|
94
|
+
return [f"Has a history of {condition_text}", f"Does not have a history of {condition_text}", "Uncertain"]
|
|
95
|
+
|
|
96
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
97
|
+
"""Load and process the data for the specified conditon."""
|
|
98
|
+
data_dir = "/share/pi/nigam/suhana/medhelm/data/CLEAR/human_labeled/"
|
|
99
|
+
excel_path = os.path.join(data_dir, f"{self.condition}.xlsx")
|
|
100
|
+
ensure_directory_exists(os.path.dirname(excel_path))
|
|
101
|
+
|
|
102
|
+
df = pd.read_excel(excel_path)
|
|
103
|
+
|
|
104
|
+
possible_answer_choices = self.get_answer_choices()
|
|
105
|
+
|
|
106
|
+
instances: List[Instance] = []
|
|
107
|
+
# All instances are assigned to the test split (zero-shot setting)
|
|
108
|
+
split = TEST_SPLIT
|
|
109
|
+
condition_text = self.CONDITION_PROMPTS[self.condition]
|
|
110
|
+
|
|
111
|
+
for _, row in df.iterrows():
|
|
112
|
+
text = str(row.get("text", "")).strip()
|
|
113
|
+
label = str(row.get("result_human", "")).strip()
|
|
114
|
+
|
|
115
|
+
# Skip the instance if any required field is missing
|
|
116
|
+
if not text or not label:
|
|
117
|
+
continue
|
|
118
|
+
|
|
119
|
+
# Map the human label to the multiple-choice option: 1 -> A, 0 -> B, 2 -> C
|
|
120
|
+
label_mapping = {
|
|
121
|
+
"1": f"Has a history of {condition_text}",
|
|
122
|
+
"0": f"Does not have a history of {condition_text}",
|
|
123
|
+
"2": "Uncertain",
|
|
124
|
+
}
|
|
125
|
+
mapped_label = label_mapping.get(label, label)
|
|
126
|
+
|
|
127
|
+
# Build the References using the possible answer choices,
|
|
128
|
+
# marking the correct one with the CORRECT_TAG.
|
|
129
|
+
references: List[Reference] = [
|
|
130
|
+
Reference(Output(text=choice), tags=[CORRECT_TAG] if choice == mapped_label else [])
|
|
131
|
+
for choice in possible_answer_choices
|
|
132
|
+
]
|
|
133
|
+
|
|
134
|
+
input_text = (
|
|
135
|
+
f"You are a medical assistant reviewing patient notes. "
|
|
136
|
+
f"Determine whether the patient has a history of {condition_text}.\n\n"
|
|
137
|
+
f"Original Clinical Note:\n"
|
|
138
|
+
f"{text}\n\n"
|
|
139
|
+
f"A. Has a history of {condition_text}\n"
|
|
140
|
+
f"B. Does not have a history of {condition_text}\n"
|
|
141
|
+
f"C. Uncertain\n\n"
|
|
142
|
+
f"Answer:"
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
instances.append(
|
|
146
|
+
Instance(
|
|
147
|
+
input=Input(text=input_text),
|
|
148
|
+
references=references,
|
|
149
|
+
split=split,
|
|
150
|
+
)
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
return instances
|
|
@@ -18,7 +18,7 @@ from helm.common.general import (
|
|
|
18
18
|
ensure_directory_exists,
|
|
19
19
|
)
|
|
20
20
|
from helm.common.hierarchical_logger import hlog
|
|
21
|
-
from .scenario import (
|
|
21
|
+
from helm.benchmark.scenarios.scenario import (
|
|
22
22
|
Scenario,
|
|
23
23
|
Instance,
|
|
24
24
|
Reference,
|
|
@@ -29,7 +29,7 @@ from .scenario import (
|
|
|
29
29
|
Output,
|
|
30
30
|
get_scenario_cache_path,
|
|
31
31
|
)
|
|
32
|
-
from .code_scenario import CodeReference, CodeInstance
|
|
32
|
+
from helm.benchmark.scenarios.code_scenario import CodeReference, CodeInstance
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
CLEVA_DATA_URL = "http://39.108.215.175/data"
|
|
@@ -57,9 +57,19 @@ from typing import List, Dict, Iterable, Optional, cast
|
|
|
57
57
|
|
|
58
58
|
from helm.common.general import ensure_file_downloaded
|
|
59
59
|
from helm.common.hierarchical_logger import hlog
|
|
60
|
-
from .code_scenario_helper import run as run_reindent
|
|
61
|
-
from .code_scenario_apps_pinned_file_order import apps_listdir_with_pinned_order
|
|
62
|
-
from .scenario import
|
|
60
|
+
from helm.benchmark.scenarios.code_scenario_helper import run as run_reindent
|
|
61
|
+
from helm.benchmark.scenarios.code_scenario_apps_pinned_file_order import apps_listdir_with_pinned_order
|
|
62
|
+
from helm.benchmark.scenarios.scenario import (
|
|
63
|
+
Scenario,
|
|
64
|
+
Instance,
|
|
65
|
+
Reference,
|
|
66
|
+
TRAIN_SPLIT,
|
|
67
|
+
VALID_SPLIT,
|
|
68
|
+
TEST_SPLIT,
|
|
69
|
+
CORRECT_TAG,
|
|
70
|
+
Input,
|
|
71
|
+
Output,
|
|
72
|
+
)
|
|
63
73
|
|
|
64
74
|
|
|
65
75
|
class CodeReference(Reference):
|
|
@@ -292,7 +302,10 @@ class CodeScenario(Scenario):
|
|
|
292
302
|
self.human_eval_hparams = dict(num_train_instances=0, num_val_instances=0, num_test_instances=164)
|
|
293
303
|
|
|
294
304
|
def get_instances(self, output_path: str) -> List[Instance]:
|
|
295
|
-
# By construction, output_path == '
|
|
305
|
+
# By construction, output_path == args.output_path + '/scenarios/code'
|
|
306
|
+
# where args.output_path is parsed from the command line argument.
|
|
307
|
+
# The default self.output_path here is '/benchmark_output/scenarios/ice'.
|
|
308
|
+
# See helm.benchmark.runner for more details about args.output_path.
|
|
296
309
|
if self.dataset == "humaneval":
|
|
297
310
|
target_path = os.path.join(output_path, "HumanEval.jsonl")
|
|
298
311
|
ensure_file_downloaded(
|
|
@@ -4,7 +4,7 @@ from typing import List
|
|
|
4
4
|
|
|
5
5
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
|
|
6
6
|
from helm.common.hierarchical_logger import hlog
|
|
7
|
-
from .scenario import (
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
8
|
Scenario,
|
|
9
9
|
Instance,
|
|
10
10
|
Reference,
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from typing import Dict, List, Any
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Input,
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Reference,
|
|
10
|
+
TRAIN_SPLIT,
|
|
11
|
+
VALID_SPLIT,
|
|
12
|
+
CORRECT_TAG,
|
|
13
|
+
Output,
|
|
14
|
+
)
|
|
15
|
+
from helm.common.general import ensure_file_downloaded
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ConvFinQACalcScenario(Scenario):
|
|
19
|
+
"""A mathematical calculation benchmark based on ConvFinQA.
|
|
20
|
+
|
|
21
|
+
Data source:
|
|
22
|
+
https://github.com/czyssrs/ConvFinQA
|
|
23
|
+
|
|
24
|
+
Reference:
|
|
25
|
+
Zhiyu Chen, Shiyang Li, Charese Smiley, Zhiqiang Ma, Sameena Shah, and William Yang Wang. 2022.
|
|
26
|
+
ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering.
|
|
27
|
+
In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing,
|
|
28
|
+
pages 6279–6292, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
|
|
29
|
+
https://aclanthology.org/2022.emnlp-main.421
|
|
30
|
+
""" # noqa: E501
|
|
31
|
+
|
|
32
|
+
name = "conv_fin_qa_calc"
|
|
33
|
+
description = "A mathematical calculation benchmark based on ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering [(Chen ey al., 2022)](https://arxiv.org/pdf/2210.03849.pdf)." # noqa: E501
|
|
34
|
+
tags = ["question_answering", "finance"]
|
|
35
|
+
|
|
36
|
+
DATASET_DOWNLOAD_URL: str = (
|
|
37
|
+
"https://github.com/czyssrs/ConvFinQA/raw/cf3eed2d5984960bf06bb8145bcea5e80b0222a6/data.zip"
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
_SPLIT_TO_JSON_FILE_NAME: Dict[str, str] = {TRAIN_SPLIT: "train_turn.json", VALID_SPLIT: "dev_turn.json"}
|
|
41
|
+
|
|
42
|
+
def make_pseudo_markdown_table(self, table: List[List[Any]], sep: str = "\n") -> str:
|
|
43
|
+
markdown_lines: List[str] = []
|
|
44
|
+
|
|
45
|
+
for row in table:
|
|
46
|
+
row_inner_markdown = " | ".join([str(cell) for cell in row])
|
|
47
|
+
row_markdown = f"| {row_inner_markdown} |"
|
|
48
|
+
markdown_lines.append(row_markdown)
|
|
49
|
+
|
|
50
|
+
return sep.join(markdown_lines)
|
|
51
|
+
|
|
52
|
+
def convert_to_instance(self, dic: Dict[str, Any], split: str, sep: str = "\n") -> Instance:
|
|
53
|
+
linearized_table = self.make_pseudo_markdown_table(dic["table"])
|
|
54
|
+
input_text = f"Table: {sep}{linearized_table}{sep}{sep}"
|
|
55
|
+
|
|
56
|
+
if "gold_ind" in dic["annotation"]:
|
|
57
|
+
facts = dic["annotation"]["gold_ind"]
|
|
58
|
+
elif "gold_inds" in dic["annotation"]:
|
|
59
|
+
facts = dic["annotation"]["gold_inds"]
|
|
60
|
+
else:
|
|
61
|
+
facts = {}
|
|
62
|
+
table_text = ""
|
|
63
|
+
for fact_type, fact in facts.items():
|
|
64
|
+
if "text" in fact_type:
|
|
65
|
+
table_text += fact
|
|
66
|
+
if table_text:
|
|
67
|
+
input_text += f"Text: {sep}{table_text}{sep}{sep}"
|
|
68
|
+
|
|
69
|
+
for ind, q in enumerate(dic["annotation"]["cur_dial"]):
|
|
70
|
+
if ind < len(dic["annotation"]["cur_dial"]) - 1:
|
|
71
|
+
input_text += f"Question: {q}{sep}Answer: {dic['annotation']['exe_ans_list'][ind]}{sep}"
|
|
72
|
+
else:
|
|
73
|
+
input_text += f"Question: {q}"
|
|
74
|
+
|
|
75
|
+
answer = str(dic["annotation"]["exe_ans"])
|
|
76
|
+
return Instance(
|
|
77
|
+
input=Input(text=input_text),
|
|
78
|
+
references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
|
|
79
|
+
split=split,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
83
|
+
data_path = os.path.join(output_path, "data")
|
|
84
|
+
ensure_file_downloaded(
|
|
85
|
+
source_url=self.DATASET_DOWNLOAD_URL,
|
|
86
|
+
target_path=os.path.join(output_path, "data"),
|
|
87
|
+
unpack=True,
|
|
88
|
+
unpack_type="unzip",
|
|
89
|
+
)
|
|
90
|
+
instances: List[Instance] = []
|
|
91
|
+
for split, json_file_name in self._SPLIT_TO_JSON_FILE_NAME.items():
|
|
92
|
+
json_file_path = os.path.join(data_path, json_file_name)
|
|
93
|
+
with open(json_file_path) as f:
|
|
94
|
+
raw_instances = json.load(f)
|
|
95
|
+
for raw_instance in raw_instances:
|
|
96
|
+
instances.append(self.convert_to_instance(raw_instance, split))
|
|
97
|
+
return instances
|
|
@@ -4,7 +4,7 @@ import tqdm
|
|
|
4
4
|
from typing import List
|
|
5
5
|
|
|
6
6
|
from helm.common.general import ensure_file_downloaded
|
|
7
|
-
from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TEST_SPLIT, Input, Output
|
|
7
|
+
from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, CORRECT_TAG, TEST_SPLIT, Input, Output
|
|
8
8
|
|
|
9
9
|
datatag2hash_text = {
|
|
10
10
|
# The "average" book.
|
|
@@ -2,7 +2,16 @@ import os
|
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
4
|
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
|
|
5
|
-
from .scenario import
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
ALL_SPLITS,
|
|
10
|
+
CORRECT_TAG,
|
|
11
|
+
VALID_SPLIT,
|
|
12
|
+
Input,
|
|
13
|
+
Output,
|
|
14
|
+
)
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
class COVIDDialogScenario(Scenario):
|