crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
# type: ignore
|
|
3
|
+
# fmt: off
|
|
4
|
+
|
|
5
|
+
import transformers
|
|
6
|
+
import os
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import tiktoken
|
|
9
|
+
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
from typing import Any, Dict, Optional, Callable
|
|
12
|
+
|
|
13
|
+
from helm.common.general import check_file_exists
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_instructions(path_to_instructions: str) -> Dict[int, Dict[str, Any]]:
    """
    Build a map from instruction ID to instruction details.

    Reads a tab-separated file from the user-specified path; each record
    must provide at least the following columns:
    - instruction_id: The ID of the instruction.
    - question: The text of the instruction.
    - person_id: The ID of the associated patient.

    See https://stanfordmedicine.box.com/s/0om9qav2sklb9vaitn0ibye65vgbfx0e

    Parameters:
        path_to_instructions (str): Path to the tab-separated instructions file.

    Returns:
        Dict[int, Dict[str, Any]]: A dictionary mapping instruction IDs to a
        dictionary with the instruction text ("instruction") and the
        associated patient ID ("patient_id").

    Raises:
        FileNotFoundError: If the specified file does not exist.
        ValueError: If the file does not contain the expected columns.
    """
    if not os.path.exists(path_to_instructions):
        raise FileNotFoundError(
            f"The specified file {path_to_instructions} does not exist."
        )

    frame = pd.read_csv(path_to_instructions, sep='\t')
    required_columns = {"instruction_id", "question", "person_id"}
    if not required_columns.issubset(frame.columns):
        raise ValueError(
            f"The CSV file is missing one or more of the required columns: {required_columns}"
        )

    # NOTE: a filter on ``is_selected_ehr == 'yes'`` used to live here; all
    # rows are currently kept.
    instructions_map: Dict[int, Dict[str, Any]] = {}
    for _, record in frame.iterrows():
        instructions_map[record["instruction_id"]] = {
            "instruction": record["question"],
            "patient_id": record["person_id"],
        }
    return instructions_map
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def extract_patient_id_from_fname(fname: str) -> Optional[int]:
    """
    Extract and return the patient ID from a given filename.

    Handles both bare numeric names such as '123.xml' and the documented
    'EHR_<patient_id>.xml' form, where <patient_id> is a sequence of digits.

    Parameters:
        fname (str): The filename from which to extract the patient ID.

    Returns:
        Optional[int]: The extracted patient ID as an integer, or None if
        the filename doesn't match the expected format.
    """
    # The stem is everything before the first '.', e.g. 'EHR_123' or '123'.
    stem = fname.split('.')[0]
    match = re.search(r"\d+", stem)
    if match is None:
        # Previously this fell through to int(stem), which raised ValueError
        # for any non-numeric stem (including the documented 'EHR_<id>' form).
        # The caller (get_ehrs) checks for None to skip non-conforming files,
        # so honor that contract here.
        return None
    return int(match.group(0))
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_ehrs(path_to_ehrs: str) -> Dict[int, str]:
    """
    Build a map from patient ID to EHR (Electronic Health Record) timeline.

    EHR timelines are plain strings read from the user-specified directory.
    Each file in the directory should be named 'EHR_<patient_id>.xml', where
    <patient_id> is a sequence of digits; files that don't yield a patient ID
    are skipped with a warning.

    See https://stanfordmedicine.box.com/s/r28wfwwude9rpjtu0szhzegmku8qv2pe

    Parameters:
        path_to_ehrs (str): The path to the directory containing the EHR files.

    Returns:
        Dict[int, str]: A dictionary mapping patient IDs to EHR timelines.

    Raises:
        FileNotFoundError: If the specified directory does not exist.
    """
    if not os.path.isdir(path_to_ehrs):
        raise FileNotFoundError(
            f"The specified directory {path_to_ehrs} does not exist."
        )

    timelines: Dict[int, str] = {}
    for entry in os.listdir(path_to_ehrs):
        patient_id = extract_patient_id_from_fname(entry)
        if patient_id is None:
            print(
                f"Warning: File '{entry}' does not match the expected format "
                "and will be skipped."
            )
            continue

        with open(os.path.join(path_to_ehrs, entry), encoding="utf-8", mode="r") as handle:
            timelines[patient_id] = handle.read()
    return timelines
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def get_tokenizer(tokenizer_name: str) -> Callable:
    """
    Returns a tokenizer based on the given tokenizer name.

    Parameters:
        tokenizer_name (str): The name of the tokenizer. Any of these
            OpenAI-style aliases (case-insensitive) resolves to tiktoken's
            "cl100k_base" encoding:
            - "tiktoken"
            - "chatgpt"
            - "gpt-3.5-turbo"
            - "gpt-4"
            - "gpt-4-turbo"
            - "gpt-4o"
            - "cl100k_base"
            Any other value is treated as a Hugging Face model/tokenizer name
            and loaded via transformers.AutoTokenizer.

    Returns:
        Callable: The tokenizer instance.
    """
    # Aliases that all map to the cl100k_base BPE encoding.
    cl100k_aliases = {
        "tiktoken",
        "chatgpt",
        "gpt-3.5-turbo",
        "gpt-4",
        "gpt-4-turbo",
        "gpt-4o",
        "cl100k_base",
    }
    if tokenizer_name.lower() in cl100k_aliases:
        return tiktoken.get_encoding("cl100k_base")
    # Fall back to Hugging Face; legacy=False opts into the fixed (non-legacy)
    # slow-tokenizer behavior. (A stray debug print of the name was removed.)
    return transformers.AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def pack_and_trim_prompts(
    instructions: Dict[int, Dict[str, str]],
    ehrs: Dict[int, str],
    prompt_string: str,
    context_length: int,
    generation_length: int,
    tokenizer: Any,
    verbose: bool = False,
    include_ehr: bool = True,
) -> Dict[int, str]:
    """
    Build one prompt per instruction, truncating each EHR to fit the context.

    The token budget for the EHR is
    ``context_length - generation_length - template tokens - instruction tokens``.
    EHRs are truncated from the front (keeping the end of the timeline) before
    being substituted into ``prompt_string``.

    Parameters:
        instructions: Map of instruction ID -> {"instruction", "patient_id"}.
        ehrs: Map of patient ID -> full EHR timeline string.
        prompt_string: Template with ``{question}`` and ``{ehr}`` placeholders.
        context_length: Total model context size, in tokens.
        generation_length: Tokens reserved for the model's generation.
        tokenizer: Object exposing ``encode``/``decode`` (tiktoken- or HF-style).
        verbose: If True, print each constructed prompt.
        include_ehr: If False, the EHR budget is zero and the EHR is omitted.

    Returns:
        A map from Instruction ID to prompt
    """
    prompts_map: Dict[int, str] = {}
    for instruction_id in tqdm(instructions.keys()):
        instruction = instructions[instruction_id]["instruction"]
        patient_id = int(instructions[instruction_id]["patient_id"])
        relevant_ehr = ehrs[patient_id]

        # Token counts for the fixed parts of the prompt, measured with the
        # *target* tokenizer; whatever remains is the EHR's budget.
        num_tokens_instruction = len(tokenizer.encode(instruction))
        num_tokens_prompt_template = len(tokenizer.encode(prompt_string))
        if include_ehr:
            target_ehr_length = context_length - generation_length - num_tokens_prompt_template - num_tokens_instruction
        else:
            target_ehr_length = 0
        if target_ehr_length <= 0:
            # No room (or EHR excluded): emit the prompt with an empty EHR.
            prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr="")
        else:
            # Do a first pass with a fast tokenizer
            fast_tokenizer = tiktoken.get_encoding("cl100k_base")
            fast_encoded = fast_tokenizer.encode(relevant_ehr)
            # NOTE(review): this branch performs the two-pass truncation when
            # the fast encoding ALREADY FITS the budget (the first slice is a
            # no-op since len(fast_encoded) <= target_ehr_length); the intent
            # may have been the reverse (two-pass when too long) — confirm
            # before changing.
            if len(fast_encoded) <= target_ehr_length:
                fast_encoded_truncated = fast_encoded[-(2 * target_ehr_length) :]
                fast_truncated_ehr = fast_tokenizer.decode(fast_encoded_truncated)

                # Then do a second pass with the actual tokenizer
                encoded_ehr = tokenizer.encode(fast_truncated_ehr)
                truncated_encoded_ehr = encoded_ehr[-target_ehr_length:]
                truncated_ehr = tokenizer.decode(truncated_encoded_ehr)
                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)
            else:
                # If the fast encoding is still too long, just use the full EHR up to allowed length
                truncated_ehr = fast_tokenizer.decode(fast_encoded[-target_ehr_length:])
                prompt_with_truncated_ehr = prompt_string.format(question=instruction, ehr=truncated_ehr)

        prompts_map[instruction_id] = prompt_with_truncated_ehr

        if verbose:
            print(prompt_with_truncated_ehr)
            print("~" * 20)
    return prompts_map
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def preprocess_prompts(
    target_context_length,
    generation_length,
    path_to_instructions,
    path_to_ehrs,
    include_ehr,
    tokenizer,
    codes_only=False,
    notes_only=False,
):
    """
    Load instructions and EHRs, then construct truncated prompts for each.

    Orchestrates the full preprocessing pipeline: fetch instructions, fetch
    EHR timelines, resolve the tokenizer, pack/trim the prompts, and return
    a dataframe mapping instruction IDs to prompts.

    Parameters:
        target_context_length: Total context size (tokens) for prompt packing.
        generation_length: Tokens reserved for the model's answer.
        path_to_instructions: Path to the tab-separated instructions file.
        path_to_ehrs: Directory of per-patient EHR XML files.
        include_ehr: Whether to include the EHR text in the prompt.
        tokenizer: Tokenizer name, resolved via get_tokenizer.
        codes_only: Unused; kept for interface compatibility.
        notes_only: Unused; kept for interface compatibility.

    Returns:
        pd.DataFrame: Columns 'instruction_id' and 'prompt'.

    Raises:
        AssertionError: If no prompts could be constructed.
    """
    print(
        f"\n\twith target context length = {target_context_length} "
        f"\n\twith target generation length = {generation_length} "
    )

    # FETCH INSTRUCTIONS
    print("Fetching instructions...")
    instructions = get_instructions(path_to_instructions)

    # FETCH RELEVANT EHRs #
    print("Fetching patient EHR timelines...")
    ehrs = get_ehrs(path_to_ehrs)

    # LOAD TOKENIZER #
    print("Loading tokenizer...")
    tokenizer = get_tokenizer(tokenizer)

    # CONSTRUCT & TRUNCATE PROMPTS #
    print("Constructing prompts using instructions and EHRs...")
    prompt_string = (
        "Instruction: Answer the following question based on the EHR:\n\n"
        "EHR: {ehr}\n\nQuestion: {question}\n\nAnswer:"
    )

    filled_prompts = pack_and_trim_prompts(
        instructions=instructions,
        ehrs=ehrs,
        prompt_string=prompt_string,
        context_length=target_context_length,
        generation_length=generation_length,
        tokenizer=tokenizer,
        verbose=False,
        include_ehr=include_ehr,
    )
    assert filled_prompts, f"No prompts were found for length: {target_context_length}. Try again with a larger length."

    # SAVE CONSTRUCTED PROMPTS TO DISK
    rows = []
    for instruction_id, prompt in tqdm(filled_prompts.items()):
        details = instructions[instruction_id]
        rows.append(
            {
                "instruction_id": instruction_id,
                "patient_id": details["patient_id"],
                "instruction": details["instruction"],
                "ehr": "".join(ehrs[details["patient_id"]]),
                "prompt": prompt,
                "context_length": target_context_length,
                "generation_length": generation_length,
            }
        )

    prompts_df = pd.DataFrame(rows)
    # Round-trip through a plain dict to produce a minimal two-column frame.
    id_to_prompt = (
        prompts_df[["instruction_id", "prompt"]].set_index("instruction_id").to_dict().get("prompt")
    )
    instructionid_to_prompt_df = (
        pd.DataFrame.from_dict(id_to_prompt, orient="index", columns=["prompt"])
        .reset_index()
        .rename(columns={"index": "instruction_id"})
    )

    print("...Prompt construction complete")
    return instructionid_to_prompt_df
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def add_reference_responses(prompts_df, path_to_reference_responses) -> pd.DataFrame:
    """
    Attach clinician-written reference responses to the prompts frame.

    Reads the tab-separated reference file, keeps only rows annotated by
    'Annotator_1', and inner-joins them onto ``prompts_df`` by instruction ID.

    Parameters:
        prompts_df (pd.DataFrame): Frame with an 'instruction_id' column.
        path_to_reference_responses (str): Path to the clinician responses TSV.

    Returns:
        pd.DataFrame: Rows of ``prompts_df`` joined with 'clinician_response'.
    """
    references = pd.read_csv(path_to_reference_responses, sep='\t')
    references = references.query("annotator_num == 'Annotator_1'")
    references = references[["instruction_id", "clinician_response"]]
    return references.merge(prompts_df, on="instruction_id", how="inner")
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
    """
    Build the full MedAlign dataframe: prompts joined with reference responses.

    Parameters:
        max_length (int): Target context length (tokens) for prompt packing.
        data_path (str): Directory containing the MedAlign release files.

    Returns:
        pd.DataFrame: One row per instruction, with prompt and clinician response.

    Raises:
        Exception: Via check_file_exists if a required input file is missing.
    """
    generation_length = 256
    path_to_instructions = os.path.join(data_path, "clinician-reviewed-model-responses.tsv")
    check_file_exists(
        path_to_instructions,
        msg=f"[MedAlignScenario] Required instructions file not found: '{path_to_instructions}'",
    )
    path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
    path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
    check_file_exists(
        path_to_reference_responses,
        msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'",
    )

    # Fixed choices for the benchmark: always include the EHR text and count
    # tokens with the cl100k_base ("tiktoken") encoding.
    prompts = preprocess_prompts(
        target_context_length=max_length,
        generation_length=generation_length,
        path_to_instructions=path_to_instructions,
        path_to_ehrs=path_to_ehrs,
        include_ehr=True,
        tokenizer="tiktoken",
    )
    return add_reference_responses(prompts, path_to_reference_responses)
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import csv
|
|
3
|
+
import sys
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
Input,
|
|
11
|
+
Instance,
|
|
12
|
+
Output,
|
|
13
|
+
Reference,
|
|
14
|
+
Scenario,
|
|
15
|
+
ScenarioMetadata,
|
|
16
|
+
)
|
|
17
|
+
from helm.common.general import ensure_file_downloaded
|
|
18
|
+
|
|
19
|
+
csv.field_size_limit(sys.maxsize)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class MedBulletsScenario(Scenario):
    """
    From "Benchmarking Large Language Models on Answering and Explaining Challenging Medical Questions"
    (Chen et al.), MedBullet is a dataset comprising USMLE Step 2&3 style clinical questions. The dataset
    is designed to evaluate the performance of LLMs in answering and explaining challenging medical questions,
    emphasizing the need for explainable AI in medical QA.

    Example from the dataset:

    Question:
    A 42-year-old woman is enrolled in a randomized controlled trial to study cardiac function in the setting of
    several different drugs. She is started on verapamil and instructed to exercise at 50% of her VO2 max while
    several cardiac parameters are being measured. During this experiment, which of the following represents
    the relative conduction speed through the heart from fastest to slowest?

    A) AV node > ventricles > atria > Purkinje fibers
    B) Purkinje fibers > ventricles > atria > AV node
    C) Purkinje fibers > atria > ventricles > AV node
    D) Purkinje fibers > AV node > ventricles > atria

    Answer:
    The answer is C. Explanation: The conduction velocity of the structures of the heart is in the following order:
    Purkinje fibers > atria > ventricles > AV node. A calcium channel blocker such as verapamil would only slow
    conduction in the AV node.

    Citation:
    @Article{MedBullet,
    author = {Hanjie Chen and Zhouxiang Fang and Yash Singla and Mark Dredze},
    title = {Benchmarking Large Language Models on Answering and Explaining Challenging Medical Questions},
    year = {2023}}

    Task:
    Given a clinical question with multiple-choice options, models must identify the correct answer and generate a
    response that includes the reasoning, as described in the expert-written explanation.
    """

    # Trailing '/' is part of the base URL; joiners below must not add another.
    DATASET_DOWNLOAD_BASE_URL = (
        "https://raw.githubusercontent.com/HanjieChen/ChallengeClinicalQA/refs/heads/main/medbullets/"
    )

    name = "medbullets"
    description = (
        "Medbullets is a benchmark of USMLE-style medical questions designed to assess a"
        "model’s ability to understand and apply clinical knowledge. Each question is accompanied"
        "by a patient scenario and five multiple-choice options, similar to those found on"
        "Step 2 and Step 3 on the US medical licensing exam."
    )
    tags = ["reasoning", "biomedical"]

    # Define the possible answer choices
    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C", "D", "E"]

    def __init__(self):
        super().__init__()
        # self.splits = {"_op4": TRAIN_SPLIT, "_op5": TEST_SPLIT}
        # limit to zero shot setting for now
        self.splits = {"_op5": TEST_SPLIT}

    def download_csv(self, output_path: str, split: str):
        """Download the CSV for the given split suffix and return its local path."""
        csv_path = os.path.join(output_path, f"medbullets{split}.csv")
        # FIX: DATASET_DOWNLOAD_BASE_URL already ends with '/'; the previous
        # f-string inserted another, producing '.../medbullets//medbullets...'.
        ensure_file_downloaded(
            source_url=f"{self.DATASET_DOWNLOAD_BASE_URL}medbullets{split}.csv",
            target_path=csv_path,
            unpack=False,
        )
        return csv_path

    def process_csv(self, csv_path: str, split: str) -> List[Instance]:
        """Read and process a CSV file to generate instances."""
        instances: List[Instance] = []
        with open(csv_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                # Validate required fields; skip malformed rows rather than crash.
                if not row.get("question") or not row.get("answer_idx") or not row.get("opa"):
                    print(f"Skipping invalid row: {row}")
                    continue

                # Map option letters to their answer text.
                option_map = {
                    "A": row.get("opa", "Not applicable"),
                    "B": row.get("opb", "Not applicable"),
                    "C": row.get("opc", "Not applicable"),
                    "D": row.get("opd", "Not applicable"),
                    "E": row.get("ope", "Not applicable"),
                }

                # Letter of the correct answer, e.g. "C".
                correct_option = row["answer_idx"]

                # One Reference per possible choice; only the correct one is tagged.
                references = [
                    Reference(
                        Output(text=option_map.get(option, "Not applicable")),
                        tags=[CORRECT_TAG] if option == correct_option else [],
                    )
                    for option in self.POSSIBLE_ANSWER_CHOICES
                ]

                instances.append(
                    Instance(
                        input=Input(text=row["question"]),
                        references=references,
                        split=split,
                    )
                )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download and process dataset to generate instances."""
        instances: List[Instance] = []
        for split_suffix, split in self.splits.items():
            csv_path = self.download_csv(output_path, split_suffix)
            instances.extend(self.process_csv(csv_path, split))
        return instances

    def get_metadata(self):
        """Return the HELM scenario metadata for this benchmark."""
        return ScenarioMetadata(
            name="medbullets",
            display_name="Medbullets",
            description="Medbullets is a benchmark of USMLE-style medical questions designed to assess "
            "a model's ability to understand and apply clinical knowledge. Each question is "
            "accompanied by a patient scenario and five multiple-choice options, similar to "
            "those found on Step 2 and Step 3 board exams [(MedBullets, "
            "2025)](https://step2.medbullets.com).",
            taxonomy=TaxonomyInfo(
                task="Question answering",
                what="Medical knowledge testing",
                when="Any",
                # FIX: was the garbled "Medical student, . Researcher".
                who="Medical student, Researcher",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
from typing import Dict, List
|
|
2
|
+
from datasets import load_dataset
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
|
+
from helm.common.hierarchical_logger import hlog
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Reference,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
PassageQuestionInput,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MedCalcBenchScenario(Scenario):
    """
    MedCalc-Bench: a benchmark of clinical calculation problems.

    MedCalc-Bench is the first medical calculation dataset used to benchmark
    LLMs' ability to serve as clinical calculators. Each instance pairs a
    patient note with a question asking for a specific clinical value (e.g.
    a SOFA score); the reference is the ground-truth answer value. The
    dataset covers 55 different calculation tasks, with 10,053 training
    instances and 1,047 test instances; only the test split is used here
    (zero-shot setting).

    Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.0
    Paper: https://arxiv.org/abs/2406.12036

    Citation:
    @misc{khandekar2024medcalcbench,
    title={MedCalc-Bench: Evaluating Large Language Models for Medical Calculations},
    author={Nikhil Khandekar and Qiao Jin and Guangzhi Xiong and Soren Dunn and Serina S Applebaum and
    Zain Anwar and Maame Sarfo-Gyamfi and Conrad W Safranek and Abid A Anwar and Andrew Zhang and
    Aidan Gilson and Maxwell B Singer and Amisha Dave and Andrew Taylor and Aidong Zhang and
    Qingyu Chen and Zhiyong Lu},
    year={2024}, eprint={2406.12036}, archivePrefix={arXiv}}
    """

    name = "medcalc_bench"
    description = (
        "MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute"
        "clinically relevant values from patient notes. Each instance consists of a clinical note"
        "describing the patient's condition, a diagnostic question targeting a specific medical"
        "value, and a ground truth response."
    )
    tags = ["knowledge", "reasoning", "biomedical"]

    def __init__(self):
        super().__init__()

    def process_csv(self, data, split: str) -> List[Instance]:
        """Convert one Hugging Face dataset split into HELM instances."""
        hlog(f"Processing data for {split} split")
        instances: List[Instance] = []
        for record in data:
            # Each record carries the note, the calculation question, and
            # the ground-truth value plus its acceptable bounds.
            prompt = PassageQuestionInput(
                passage=record["Patient Note"] + "\n",
                question=record["Question"] + "\n",
                passage_prefix="Patient note: ",
            )
            instances.append(
                Instance(
                    input=prompt,
                    references=[Reference(Output(text=record["Ground Truth Answer"]), tags=[CORRECT_TAG])],
                    extra_data={
                        "category": record["Category"],
                        "upper_limit": record["Upper Limit"],
                        "lower_limit": record["Lower Limit"],
                    },
                    split=split,
                    id=record["Row Number"],
                )
            )
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download MedCalc-Bench from Hugging Face and build instances."""
        dataset = load_dataset("ncbi/MedCalc-Bench-v1.0")

        # Zero-shot setting: the train split is deliberately not used.
        split_mapping: Dict[str, str] = {
            # "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        instances: List[Instance] = []
        for hf_name, helm_split in split_mapping.items():
            instances.extend(self.process_csv(dataset[hf_name], helm_split))

        return instances

    def get_metadata(self):
        """Return the HELM scenario metadata for this benchmark."""
        return ScenarioMetadata(
            name="medcalc_bench",
            display_name="MedCalc-Bench",
            description="MedCalc-Bench is a benchmark designed to evaluate models on their ability to "
            "compute clinically relevant values from patient notes. Each instance consists "
            "of a clinical note describing the patient's condition, a diagnostic question "
            "targeting a specific medical value, and a ground truth response. [(Khandekar "
            "et al., 2024)](https://arxiv.org/abs/2406.12036).",
            taxonomy=TaxonomyInfo(
                task="Computational reasoning",
                what="Compute a specific medical value from a patient note",
                when="Any",
                who="Clinician, Researcher",
                language="English",
            ),
            main_metric="medcalc_bench_accuracy",
            main_split="test",
        )
|