crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

helm/benchmark/scenarios/mental_health_scenario.py

@@ -0,0 +1,112 @@
+import pandas as pd
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Output,
+)
+
+
+class MentalHealthScenario(Scenario):
+    """
+    This scenario evaluates language models' ability to generate appropriate counseling responses
+    in mental health conversations. The dataset contains counseling dialogues covering
+    various topics including workplace issues, anxiety, suicidal thoughts, relationship
+    problems, and more.
+
+    Each dialogue consists of interactions between a counselor and a client, where the counselor
+    demonstrates expert mental health counseling techniques. The dialogues were selected based on high
+    quality scores from multiple evaluators.
+
+    Example dialogue structure:
+    ```
+    counselor: Hi there, to start can you tell me your name and a little bit about what's been going on?
+    client: I sleep too much... I'm 23, female and work as IT professional. I feel like I'm not fitting in...
+    counselor: I can see you have been facing challenges with feeling like you don't fit in...
+    ```
+
+    The task is to generate the next counselor response given the conversation history. Models
+    are evaluated on their ability to:
+    1. Provide empathetic and supportive responses
+    2. Follow proper mental health counseling protocols
+    3. Generate contextually appropriate interventions
+
+    The dataset includes:
+    - 7 complete dialogues covering different mental health topics
+    - Metadata about dialogue topic and type
+    - Gold-standard counselor responses as references
+    - Full conversation history for context
+
+    Each instance includes:
+    - input: Previous conversation turns formatted with speaker labels
+    - reference: The actual counselor's response (gold standard)
+    - metadata: Topic and type of mental health conversation
+    """
+
+    name = "mental_health"
+    description = "A dataset containing a counselor and mental health patient conversation, where the objective is to \
+generate an empathetic counselor response."
+    tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]
+
+    def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
+        """
+        Process the dialogue data into evaluation instances.
+        Each instance represents a point in the conversation where the model needs to generate
+        a counselor response.
+
+        Args:
+            data (pd.DataFrame): DataFrame containing processed dialogues with columns:
+                - context: Previous conversation history
+                - gold_counselor_response: Reference counselor response
+                - topic: Type of conversation (e.g., "Anxiety", "Workplace")
+                - dialogue_type: Numerical type identifier
+
+        Returns:
+            List[Instance]: List of processed instances ready for evaluation
+        """
+        instances: List[Instance] = []
+
+        for _, row in data.iterrows():
+            # Format input with clear section breaks and instructions
+            input_text = (
+                f"Topic: {row['topic']}\n"
+                f"Type: {row['dialogue_type']}\n"
+                f"\nPrevious conversation:\n{row['context']}\n"
+                f"\nGenerate an empathetic and appropriate counselor response:"
+            )
+
+            # Create input with empty passage since all context is in question
+            prompt = PassageQuestionInput(passage="", question=input_text)
+
+            # Create instance with gold standard response
+            instance = Instance(
+                input=prompt,
+                references=[Reference(Output(text=row["gold_counselor_response"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Load and process the mental health dialogue dataset.
+
+        Args:
+            output_path (str): Path for any cached or intermediate files
+
+        Returns:
+            List[Instance]: List of processed instances for evaluation
+        """
+        # Load the processed dialogue data
+        data_path = "/share/pi/nigam/data/medhelm/mental_health/processed_dialogues.csv"
+        dialogue_data = pd.read_csv(data_path)
+
+        # Process into instances
+        instances = self.process_dialogue_data(dialogue_data)
+
+        return instances
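For orientation, a minimal sketch of the prompt this new scenario builds per dialogue row. The toy DataFrame values below are hypothetical; it assumes crfm-helm 0.5.5 and pandas are installed. Since the shipped `get_instances` reads a hard-coded cluster path, the sketch drives `process_dialogue_data` directly:

```python
import pandas as pd
from helm.benchmark.scenarios.mental_health_scenario import MentalHealthScenario

# Hypothetical one-row DataFrame with the columns the scenario expects.
df = pd.DataFrame([{
    "topic": "Anxiety",        # dialogue topic
    "dialogue_type": 1,        # numerical type identifier
    "context": "counselor: Hi there...\nclient: I sleep too much...",
    "gold_counselor_response": "I can see you have been facing challenges...",
}])

instances = MentalHealthScenario().process_dialogue_data(df)
# Topic/type header, conversation history, then the generation instruction:
print(instances[0].input.text)
```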

helm/benchmark/scenarios/mimic_bhc_scenario.py

@@ -0,0 +1,98 @@
+import json
+from typing import Dict, List
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class MIMICBHCScenario(Scenario):
+    """
+    MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
+    course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
+
+    In total, the dataset contains 270,033 clinical notes.
+    The splits are provided by the dataset itself.
+
+    Sample Synthetic Prompt:
+        Summarize the clinical note into a brief hospital course.
+
+        Clinical Note:
+        <SEX> M <SERVICE> SURGERY <ALLERGIES> No Known Allergies \/ Adverse Drug Reactions
+        ...
+        continue to follow-up with your health care providers as an outpatient.
+
+        Brief Hospital Course:
+        Mr. ___ was pre-admitted on ___ for liver transplantation
+        ...
+        discharged home to continue home medications and follow-up as an outpatient.
+
+    @article{aali2024dataset,
+        title={A dataset and benchmark for hospital course summarization with adapted large language models},
+        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
+        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
+        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
+        journal={Journal of the American Medical Informatics Association},
+        volume={32},
+        number={3},
+        pages={470--479},
+        year={2024},
+        publisher={Oxford University Press}
+    }
+
+    @article{aali2024mimic,
+        title={MIMIC-IV-Ext-BHC: Labeled Clinical Notes Dataset for Hospital Course Summarization},
+        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
+        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
+        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
+        journal={PhysioNet},
+        year={2024}
+    }
+    """
+
+    name = "mimic_bhc"
+    description = (
+        "A summarization task using a curated collection of preprocessed discharge notes"
+        " paired with their corresponding brief hospital course (BHC) summaries."
+    )
+    tags = ["summarization", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/share/pi/nigam/data/bhc-mimiciv/"
+        ensure_directory_exists(data_path)
+        data_path = data_path + "mimic_iv_bhc.json"
+
+        instances: List[Instance] = []
+        # Limit to zero shot setting for now
+        splits: Dict[str, str] = {
+            # "train": TRAIN_SPLIT,
+            # "validate": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        with open(data_path, "r") as f:
+            data = [json.loads(line) for line in f]
+
+        for data_split, split in splits.items():
+            clinical_notes: List[str] = [x["input"] for x in data]
+            bhc_summaries: List[str] = [x["target"] for x in data]
+            assert len(clinical_notes) == len(bhc_summaries), "Notes and summaries must have the same length"
+            for clinical_note, bhc_summary in zip(clinical_notes, bhc_summaries):
+                if not clinical_note or not bhc_summary:
+                    continue
+                instances.append(
+                    Instance(
+                        input=Input(text=clinical_note),
+                        references=[Reference(Output(text=bhc_summary), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
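The loader expects a JSON-Lines file keyed by `input` and `target`; a sketch of that layout (hypothetical file location and note text, matching the keys read by `get_instances` above):

```python
import json

# Each line of mimic_iv_bhc.json is one JSON object:
#   {"input": <full discharge note>, "target": <brief hospital course summary>}
record = {
    "input": "<SEX> M <SERVICE> SURGERY ...",
    "target": "Mr. ___ was pre-admitted on ___ for liver transplantation ...",
}
with open("mimic_iv_bhc.json", "w") as f:
    f.write(json.dumps(record) + "\n")  # one record per line
```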

helm/benchmark/scenarios/mimic_rrs_scenario.py

@@ -0,0 +1,89 @@
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class MIMICRRSScenario(Scenario):
+    """
+    MIMIC-RRS is a biomedical question answering (QA) dataset collected from MIMIC-III and MIMIC-CXR
+    radiology reports.
+    In this scenario, we only consider the radiology reports from MIMIC-III.
+    In total, the dataset contains 73,259 reports.
+    The splits are provided by the dataset itself.
+
+    Sample Synthetic Prompt:
+        Generate the impressions of a radiology report based on its findings.
+
+        Findings:
+        The heart is normal in size. The lungs are clear.
+
+        Impressions:
+
+    @inproceedings{Chen_2023,
+        title={Toward Expanding the Scope of Radiology Report Summarization to Multiple Anatomies and Modalities},
+        url={http://dx.doi.org/10.18653/v1/2023.acl-short.41},
+        DOI={10.18653/v1/2023.acl-short.41},
+        booktitle={Proceedings of the 61st Annual Meeting of the Association
+        for Computational Linguistics (Volume 2: Short Papers)},
+        publisher={Association for Computational Linguistics},
+        author={Chen, Zhihong and Varma, Maya and Wan, Xiang and Langlotz, Curtis and Delbrouck, Jean-Benoit},
+        year={2023},
+        pages={469–484}
+    }
+    """
+
+    name = "mimic_rrs"
+    description = (
+        "A dataset containing radiology reports with findings sections from MIMIC-III paired with"
+        " their corresponding impression sections, used for generating radiology report summaries."
+    )
+    tags = ["question_answering", "biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/share/pi/nigam/data/rrs-mimiciii/all"
+        ensure_directory_exists(data_path)
+
+        instances: List[Instance] = []
+        # Limit to zero shot setting for now
+        splits: Dict[str, str] = {
+            # "train": TRAIN_SPLIT,
+            # "validate": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+
+        for data_split, split in splits.items():
+            split_findings_name: str = f"{data_split}.findings.tok"
+            split_impressions_name: str = f"{data_split}.impression.tok"
+            findings_path: str = os.path.join(data_path, split_findings_name)
+            impressions_path: str = os.path.join(data_path, split_impressions_name)
+            findings: List[str] = self.read_file(findings_path)
+            impressions: List[str] = self.read_file(impressions_path)
+            assert len(findings) == len(impressions), "Findings and impressions must have the same length"
+            for finding, impression in zip(findings, impressions):
+                if not finding or not impression:
+                    continue
+                instances.append(
+                    Instance(
+                        input=Input(text=finding),
+                        references=[Reference(Output(text=impression), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
+
+    def read_file(self, file_path: str) -> List[str]:
+        with open(file_path, "r") as file:
+            lines = file.readlines()
+            lines = [line.strip() for line in lines]
+            return lines
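This scenario reads parallel, line-aligned `.tok` files per split; a sketch of that layout (the impression text is a hypothetical example):

```python
# test.findings.tok and test.impression.tok must be line-aligned:
# line i of one file pairs with line i of the other.
findings = ["The heart is normal in size. The lungs are clear."]
impressions = ["No acute findings."]  # hypothetical impression text

with open("test.findings.tok", "w") as f:
    f.write("\n".join(findings) + "\n")
with open("test.impression.tok", "w") as f:
    f.write("\n".join(impressions) + "\n")
```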

helm/benchmark/scenarios/mimiciv_billing_code_scenario.py

@@ -0,0 +1,71 @@
+import os
+import pandas as pd
+import numpy as np
+from typing import List
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class MIMICIVBillingCodeScenario(Scenario):
+    """
+    A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).
+
+    - Input: The clinical note (column "text").
+    - Output: The list of ICD-10 codes (column "target").
+    """
+
+    name = "mimiciv_billing_code"
+    description = "A dataset pairing clinical notes from MIMIC-IV with corresponding ICD-10 billing codes."
+    tags = ["question_answering", "biomedical"]
+
+    def __init__(self, data_file: str):
+        """
+        :param data_file: Path to the mimiciv_icd10.feather file.
+        """
+        super().__init__()
+        self.data_file = data_file
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.dirname(self.data_file))
+
+        # Read the preprocessed MIMIC-IV data (.feather format)
+        df = pd.read_feather(self.data_file)  # columns: ["text", "target", ...]
+
+        instances: List[Instance] = []
+
+        # Use the entire dataset as one split (TEST_SPLIT)
+        for idx, row in df.iterrows():
+            try:
+                note_text: str = row["text"]
+                icd10_codes = row["target"]
+
+                # Convert numpy array to list if necessary
+                if isinstance(icd10_codes, np.ndarray):
+                    icd10_codes = icd10_codes.tolist()
+                elif not isinstance(icd10_codes, list):
+                    icd10_codes = [str(icd10_codes)]  # Handle single values
+
+                # Convert all codes to strings and join
+                codes_str = ",".join(str(code) for code in icd10_codes)
+
+                # Create one Instance per row
+                instance = Instance(
+                    input=Input(text=note_text),
+                    references=[Reference(Output(text=codes_str), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+                instances.append(instance)
+            except Exception as e:
+                print(f"Error processing row {idx}: {str(e)}")
+                continue
+
+        return instances
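Unlike the two scenarios above, this one takes its data file as a constructor argument, so it can be exercised against a local Feather file. A minimal sketch, with a hypothetical path and example codes; it assumes pandas with pyarrow is installed:

```python
import os
import pandas as pd
from helm.benchmark.scenarios.mimiciv_billing_code_scenario import MIMICIVBillingCodeScenario

# Hypothetical two-column Feather file: "text" (note) and "target" (ICD-10 code list).
os.makedirs("data", exist_ok=True)
pd.DataFrame({"text": ["Discharge summary ..."], "target": [["I10", "E11.9"]]}).to_feather(
    "data/mimiciv_icd10.feather"
)

scenario = MIMICIVBillingCodeScenario(data_file="data/mimiciv_icd10.feather")
instances = scenario.get_instances(output_path=".")
print(instances[0].references[0].output.text)  # codes comma-joined: "I10,E11.9"
```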

helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py

@@ -0,0 +1,74 @@
+import csv
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+class MMLU_Clinical_Afr_Scenario(Scenario):
+    """
+    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages
+    """
+
+    name = "mmlu_clinical_afr"
+    description = "Massive Multitask Language Understanding (MMLU) translated into 11 African low-resource languages"
+    tags = ["knowledge", "multiple_choice", "low_resource_languages"]
+
+    def __init__(self, subject: str = "clinical_knowledge", lang: str = "af"):
+        super().__init__()
+        self.subject: str = subject
+        self.lang: str = lang
+
+    def download_mmlu_clinical_afr(self, path: str):
+        ensure_file_downloaded(
+            source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip",  # noqa: E501
+            target_path=path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+    def process_csv(self, csv_path: str, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        hlog(f"Reading {csv_path}")
+        with open(csv_path) as f:
+            reader = csv.reader(f, delimiter=",")
+            for row in reader:
+
+                question, answers, correct_choice = row[0], row[1:-1], row[-1]
+                answers_dict = dict(zip(["A", "B", "C", "D"], answers))
+                correct_answer: str = answers_dict[correct_choice]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        desired_dir = "mmlu_cm_ck_vir"
+        data_path: str = os.path.join(output_path, desired_dir)
+        self.download_mmlu_clinical_afr(data_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "val": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            csv_path: str = os.path.join(data_path, desired_dir, f"{self.subject}_{split}_{self.lang}.csv")
+            if not os.path.exists(csv_path):
+                hlog(f"{csv_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_csv(csv_path, splits[split]))
+
+        return instances
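`process_csv` assumes header-less rows of the form question, options A–D, correct letter. A minimal sketch with a hypothetical row:

```python
import csv
from helm.benchmark.scenarios.mmlu_clinical_afr_scenario import MMLU_Clinical_Afr_Scenario

# One hypothetical row: question, four options, then the correct letter.
with open("clinical_knowledge_test_af.csv", "w", newline="") as f:
    csv.writer(f).writerow(["Question text?", "option A", "option B", "option C", "option D", "B"])

scenario = MMLU_Clinical_Afr_Scenario(subject="clinical_knowledge", lang="af")
instances = scenario.process_csv("clinical_knowledge_test_af.csv", "test")
print([r.tags for r in instances[0].references])  # only option B carries CORRECT_TAG
```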

helm/benchmark/scenarios/mmlu_pro_scenario.py

@@ -0,0 +1,95 @@
+from typing import Dict, List
+from datasets import Dataset, load_dataset
+
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MMLUProScenario(Scenario):
+    """
+    The MMLU-Pro dataset is an advanced version of the Massive Multitask Language Understanding (MMLU)
+    benchmark, created to push the boundaries of language models' reasoning and comprehension skills.
+    Designed as a more challenging evaluation, it increases the answer options per question from four
+    to ten, significantly reducing the likelihood of correct random guesses. This update makes the
+    dataset better at distinguishing the capabilities of models on complex tasks.
+
+    MMLU-Pro emphasizes reasoning over simple factual recall by integrating diverse, intricate questions
+    across 14 domains, including subjects like biology, economics, law, and psychology. In addition, it
+    addresses limitations in the original MMLU by filtering out trivial questions, making it a more
+    robust benchmark. Performance comparisons suggest that models benefit from reasoning-based
+    approaches (such as Chain of Thought, or CoT) on MMLU-Pro, which contrasts with the original
+    MMLU where CoT didn’t show as much benefit. This makes MMLU-Pro especially suitable for evaluating
+    advanced models that rely on nuanced reasoning and comprehension skills.
+
+    Dataset: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro
+    Paper: https://arxiv.org/abs/2406.01574
+    """
+
+    name = "mmlu_pro"
+    description = "Enhanced Massive Multitask Language Understanding with increased options and reasoning"
+    tags = ["knowledge", "multiple_choice", "reasoning"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject
+
+    def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
+        """
+        Process the dataset to create instances.
+
+        :param data: Hugging Face `Dataset` containing the data for a specific split.
+        :param split: The data split (e.g., "train", "test").
+        :return: A list of processed `Instance` objects.
+        """
+        instances: List[Instance] = []
+        hlog(f"Processing data for {split} split")
+        for row in data:
+            id = row["question_id"]
+            question = row["question"]
+            answers = row["options"]
+            correct_choice = row["answer"]
+            answers_dict = dict(zip(["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"], answers))
+            correct_answer = answers_dict[correct_choice]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+            instance = Instance(
+                id=f"id{id}",
+                input=Input(text=question),
+                references=list(map(answer_to_reference, answers)),
+                split=split,
+            )
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Load and process the MMLU-Pro dataset to create instances.
+
+        :param output_path: Path to save or output the processed instances.
+        :return: A list of all processed `Instance` objects.
+        """
+        # Load the MMLU-Pro dataset from Hugging Face
+        dataset = load_dataset("TIGER-Lab/MMLU-Pro", revision="3373e0b")
+
+        # Process all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "validation": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for hf_split, split in splits.items():
+            data = dataset[hf_split].filter(lambda x: self.subject == "all" or self.subject == x["category"])
+            instances.extend(self.process_dataset(data, split))
+
+        return instances
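This scenario is self-contained: it pulls the pinned revision from Hugging Face on first use. A minimal sketch (assumes the `datasets` package and network access; "law" is one of the dataset's `category` values):

```python
from helm.benchmark.scenarios.mmlu_pro_scenario import MMLUProScenario

# Build instances for a single category; subject="all" keeps every category.
scenario = MMLUProScenario(subject="law")
instances = scenario.get_instances(output_path="./mmlu_pro")
print(len(instances))                          # validation rows map to TRAIN_SPLIT, test rows to TEST_SPLIT
print(instances[0].references[0].output.text)  # one of up to ten answer options
```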

helm/benchmark/scenarios/mmlu_scenario.py

@@ -4,7 +4,17 @@ from typing import Dict, List
 
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class MMLUScenario(Scenario):

@@ -6,7 +6,7 @@ from typing import Dict, List, Tuple, Optional, Union
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
-from .scenario import (
+from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     Reference,