crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import requests
|
|
3
|
+
from typing import List
|
|
4
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
CORRECT_TAG,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
ScenarioMetadata,
|
|
14
|
+
)
|
|
15
|
+
from helm.common.general import ensure_directory_exists
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MTSamplesReplicateScenario(Scenario):
|
|
19
|
+
"""
|
|
20
|
+
MTSamples.com is designed to give you access to a big collection of transcribed medical reports.
|
|
21
|
+
These samples can be used by learning, as well as working medical transcriptionists for their daily
|
|
22
|
+
transcription needs. We present the model with patient information and request it to generate a corresponding
|
|
23
|
+
treatment plan.
|
|
24
|
+
|
|
25
|
+
Sample Synthetic Prompt:
|
|
26
|
+
Given various information about a patient, return a reasonable treatment plan for the patient.
|
|
27
|
+
|
|
28
|
+
- Extracts `PLAN`, `SUMMARY`, or `FINDINGS` as the reference (PLAN preferred).
|
|
29
|
+
- Removes `PLAN` from the input text but keeps other sections.
|
|
30
|
+
- Ignores files that do not contain any of these reference sections.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
GIT_HASH = "ebc104a4f96c5b7602242f301e081e9934a23344"
|
|
34
|
+
API_BASE_URL = (
|
|
35
|
+
f"https://api.github.com/repos/raulista1997/benchmarkdata/contents/mtsamples_processed?ref={GIT_HASH}"
|
|
36
|
+
)
|
|
37
|
+
RAW_BASE_URL = f"https://raw.githubusercontent.com/raulista1997/benchmarkdata/{GIT_HASH}/mtsamples_processed/"
|
|
38
|
+
|
|
39
|
+
name = "mtsamples_replicate"
|
|
40
|
+
description = (
|
|
41
|
+
"MTSamples Replicate is a benchmark that provides transcribed medical reports"
|
|
42
|
+
"from various specialties. It is used to evaluate a model's ability to generate clinically"
|
|
43
|
+
"appropriate treatment plans based on unstructured patient documentation"
|
|
44
|
+
)
|
|
45
|
+
tags = ["medical", "transcription", "plan_generation"]
|
|
46
|
+
|
|
47
|
+
def fetch_file_list(self) -> List[str]:
|
|
48
|
+
"""
|
|
49
|
+
Uses the GitHub API to fetch the list of `.txt` files at a specific commit.
|
|
50
|
+
"""
|
|
51
|
+
response = requests.get(self.API_BASE_URL)
|
|
52
|
+
if response.status_code != 200:
|
|
53
|
+
raise Exception(f"Failed to fetch file list from GitHub API: {response.text}")
|
|
54
|
+
|
|
55
|
+
files = response.json()
|
|
56
|
+
return [f["name"] for f in files if f["name"].endswith(".txt")]
|
|
57
|
+
|
|
58
|
+
def download_file(self, file_name: str, output_dir: str) -> str:
|
|
59
|
+
"""
|
|
60
|
+
Downloads a text file from GitHub and saves it locally.
|
|
61
|
+
"""
|
|
62
|
+
file_url = self.RAW_BASE_URL + file_name
|
|
63
|
+
file_path = os.path.join(output_dir, file_name)
|
|
64
|
+
|
|
65
|
+
if not os.path.exists(file_path):
|
|
66
|
+
response = requests.get(file_url)
|
|
67
|
+
if response.status_code != 200:
|
|
68
|
+
raise Exception(f"Failed to download {file_url}")
|
|
69
|
+
with open(file_path, "w", encoding="utf-8") as f:
|
|
70
|
+
f.write(response.text)
|
|
71
|
+
|
|
72
|
+
return file_path
|
|
73
|
+
|
|
74
|
+
def extract_sections(self, text: str) -> tuple:
|
|
75
|
+
"""
|
|
76
|
+
Extracts `PLAN`, `SUMMARY`, and `FINDINGS` sections from the text.
|
|
77
|
+
Returns (plan, summary, findings) as a tuple, ensuring uppercase detection.
|
|
78
|
+
"""
|
|
79
|
+
plan, summary, findings = None, None, None
|
|
80
|
+
text_upper = text.upper()
|
|
81
|
+
|
|
82
|
+
if "PLAN:" in text_upper:
|
|
83
|
+
plan = text.split("PLAN:")[1].split("\n", 1)[0].strip()
|
|
84
|
+
|
|
85
|
+
if "SUMMARY:" in text_upper:
|
|
86
|
+
summary = text.split("SUMMARY:")[1].split("\n", 1)[0].strip()
|
|
87
|
+
|
|
88
|
+
if "FINDINGS:" in text_upper:
|
|
89
|
+
findings = text.split("FINDINGS:")[1].split("\n", 1)[0].strip()
|
|
90
|
+
|
|
91
|
+
return plan, summary, findings
|
|
92
|
+
|
|
93
|
+
def remove_plan_section(self, text: str) -> str:
|
|
94
|
+
"""
|
|
95
|
+
Removes `PLAN:` section from the input text while keeping the rest.
|
|
96
|
+
"""
|
|
97
|
+
sections = ["PLAN:"]
|
|
98
|
+
for section in sections:
|
|
99
|
+
if section in text:
|
|
100
|
+
text = text.split(section)[0].strip() # Keep content before PLAN
|
|
101
|
+
return text
|
|
102
|
+
|
|
103
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
104
|
+
"""
|
|
105
|
+
Downloads, processes, and converts MTSamples data into HELM format.
|
|
106
|
+
"""
|
|
107
|
+
ensure_directory_exists(output_path)
|
|
108
|
+
|
|
109
|
+
# Fetch list of available files from GitHub
|
|
110
|
+
file_list = self.fetch_file_list()
|
|
111
|
+
|
|
112
|
+
instances = []
|
|
113
|
+
for file_name in file_list:
|
|
114
|
+
try:
|
|
115
|
+
# Download the text file
|
|
116
|
+
file_path = self.download_file(file_name, output_path)
|
|
117
|
+
|
|
118
|
+
# Read content
|
|
119
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
120
|
+
text_content = f.read().strip()
|
|
121
|
+
|
|
122
|
+
# Extract structured sections
|
|
123
|
+
plan, summary, findings = self.extract_sections(text_content)
|
|
124
|
+
|
|
125
|
+
# Use plan > summary > findings as reference text
|
|
126
|
+
reference_text = plan or summary or findings
|
|
127
|
+
if not reference_text:
|
|
128
|
+
continue # Ignore notes with no reference section
|
|
129
|
+
|
|
130
|
+
# Remove PLAN section from input
|
|
131
|
+
cleaned_text = self.remove_plan_section(text_content)
|
|
132
|
+
|
|
133
|
+
# Create HELM instance
|
|
134
|
+
instances.append(
|
|
135
|
+
Instance(
|
|
136
|
+
input=Input(text=cleaned_text), # Processed text without PLAN
|
|
137
|
+
references=[Reference(Output(text=reference_text), tags=[CORRECT_TAG])],
|
|
138
|
+
split=TEST_SPLIT,
|
|
139
|
+
)
|
|
140
|
+
)
|
|
141
|
+
except Exception as e:
|
|
142
|
+
print(f"Error processing {file_name}: {e}")
|
|
143
|
+
|
|
144
|
+
return instances
|
|
145
|
+
|
|
146
|
+
def get_metadata(self):
|
|
147
|
+
return ScenarioMetadata(
|
|
148
|
+
name="mtsamples_replicate",
|
|
149
|
+
display_name="MTSamples",
|
|
150
|
+
short_display_name="MTSamples",
|
|
151
|
+
description="MTSamples Replicate is a benchmark that provides transcribed medical reports "
|
|
152
|
+
"from various specialties. It is used to evaluate a model's ability to generate "
|
|
153
|
+
"clinically appropriate treatment plans based on unstructured patient "
|
|
154
|
+
"documentation [(MTSamples, 2025)](https://mtsamples.com).",
|
|
155
|
+
taxonomy=TaxonomyInfo(
|
|
156
|
+
task="Text generation",
|
|
157
|
+
what="Generate treatment plans based on clinical notes",
|
|
158
|
+
when="Post-diagnosis",
|
|
159
|
+
who="Clinician",
|
|
160
|
+
language="English",
|
|
161
|
+
),
|
|
162
|
+
main_metric="mtsamples_replicate_accuracy",
|
|
163
|
+
main_split="test",
|
|
164
|
+
)
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from typing import Any, Dict, List, Tuple, Optional
|
|
5
|
+
import xml.etree.ElementTree as ET
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
8
|
+
from helm.common.general import ensure_directory_exists
|
|
9
|
+
from helm.benchmark.scenarios.scenario import (
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
TRAIN_SPLIT,
|
|
12
|
+
Input,
|
|
13
|
+
Scenario,
|
|
14
|
+
Instance,
|
|
15
|
+
CORRECT_TAG,
|
|
16
|
+
Reference,
|
|
17
|
+
Output,
|
|
18
|
+
ScenarioMetadata,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
ORIGINAL_DEFINITIONS = {
|
|
22
|
+
"ABDOMINAL": "History of intra-abdominal surgery, small or large intestine resection, or small bowel obstruction",
|
|
23
|
+
"ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define “advanced” \
|
|
24
|
+
as having 2 or more of the following: • Taking 2 or more medications to treat CAD • \
|
|
25
|
+
History of myocardial infarction (MI) • Currently experiencing angina • Ischemia, past or present",
|
|
26
|
+
"ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
|
|
27
|
+
"ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)",
|
|
28
|
+
"CREATININE": "Serum creatinine level above the upper normal limit",
|
|
29
|
+
"DIETSUPP-2MOS": "Taken a dietary supplement (excluding vitamin D) in the past 2 months",
|
|
30
|
+
"DRUG-ABUSE": "Current or past history of drug abuse",
|
|
31
|
+
"ENGLISH": "Patient must speak English",
|
|
32
|
+
"HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
|
|
33
|
+
"KETO-1YR": "Diagnosis of ketoacidosis within the past year",
|
|
34
|
+
"MAJOR-DIABETES": "Major diabetes-related complication. For the purposes of this annotation, we define \
|
|
35
|
+
“major complication” (as opposed to “minor complication”) as any of the following that are a result of \
|
|
36
|
+
(or strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • \
|
|
37
|
+
Retinopathy • nephropathy • neuropathy",
|
|
38
|
+
"MAKES-DECISIONS": "Patient must make their own medical decisions",
|
|
39
|
+
"MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
|
|
40
|
+
}
|
|
41
|
+
# Custom definitions for better prompts
|
|
42
|
+
LONG_DEFINITIONS = {
|
|
43
|
+
"ABDOMINAL": "History of intra-abdominal surgery. This could include any form of intra-abdominal surgery, \
|
|
44
|
+
including but not limited to small/large intestine resection or small bowel obstruction",
|
|
45
|
+
"ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define \
|
|
46
|
+
“advanced” as having 2 or more of the following: (a) Taking 2 or more medications to treat CAD (b) History \
|
|
47
|
+
of myocardial infarction (MI) (c) Currently experiencing angina (d) Ischemia, past or present. \
|
|
48
|
+
The patient must have at least 2 of these categories (a,b,c,d) to meet this criterion, otherwise the patient \
|
|
49
|
+
does not meet this criterion. For ADVANCED-CAD, be strict in your evaluation of the patient -- if they just \
|
|
50
|
+
have cardiovascular disease, then they do not meet this criterion.",
|
|
51
|
+
"ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
|
|
52
|
+
"ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)..",
|
|
53
|
+
"CREATININE": "Serum creatinine level above the upper normal limit",
|
|
54
|
+
"DIETSUPP-2MOS": "Consumption of a dietary supplement (excluding vitamin D) in the past 2 months. To assess \
|
|
55
|
+
this criterion, go through the list of medications_and_supplements taken from the note. If a substance could \
|
|
56
|
+
potentially be used as a dietary supplement (i.e. it is commonly used as a dietary supplement, even if it \
|
|
57
|
+
is not explicitly stated as being used as a dietary supplement), then the patient meets this criterion. \
|
|
58
|
+
Be lenient and broad in what is considered a dietary supplement. For example, a 'multivitamin' and \
|
|
59
|
+
'calcium carbonate' should always be considered a dietary supplement if they are included in this list.",
|
|
60
|
+
"DRUG-ABUSE": "Current or past history of drug abuse",
|
|
61
|
+
"ENGLISH": "Patient speaks English. Assume that the patient speaks English, unless otherwise explicitly noted. \
|
|
62
|
+
If the patient's language is not mentioned in the note, then assume they speak English and thus meet \
|
|
63
|
+
this criteria.",
|
|
64
|
+
"HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
|
|
65
|
+
"KETO-1YR": "Diagnosis of ketoacidosis within the past year",
|
|
66
|
+
"MAJOR-DIABETES": "Major diabetes-related complication. Examples of “major complication” (as opposed to \
|
|
67
|
+
“minor complication”) include, but are not limited to, any of the following that are a result of (or \
|
|
68
|
+
strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • Retinopathy \
|
|
69
|
+
• nephropathy • neuropathy. Additionally, if multiple conditions together imply a severe case of diabetes, \
|
|
70
|
+
then count that as a major complication.",
|
|
71
|
+
"MAKES-DECISIONS": "Patient must make their own medical decisions. Assume that the patient makes their own \
|
|
72
|
+
medical decisions, unless otherwise explicitly noted. There is no information provided about the \
|
|
73
|
+
patient's ability to make their own medical decisions, then assume they do make their own decisions and \
|
|
74
|
+
therefore meet this criteria.\"",
|
|
75
|
+
"MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class XMLDataLoader:
|
|
80
|
+
def __init__(
|
|
81
|
+
self, path_to_folder: str, is_convert_to_numbers=True, is_split_text=True, is_remove_excessive_new_lines=True
|
|
82
|
+
):
|
|
83
|
+
self.path_to_folder = path_to_folder
|
|
84
|
+
self.is_convert_to_numbers = is_convert_to_numbers
|
|
85
|
+
self.is_split_text = is_split_text
|
|
86
|
+
self.is_remove_excessive_new_lines = is_remove_excessive_new_lines
|
|
87
|
+
|
|
88
|
+
def load_data(self) -> List[Dict[str, Any]]:
|
|
89
|
+
"""Main function: Data loader for the XML files"""
|
|
90
|
+
data = []
|
|
91
|
+
file_names = os.listdir(self.path_to_folder)
|
|
92
|
+
file_names = sorted([file for file in file_names if file.endswith(".xml")])
|
|
93
|
+
for file_name in file_names:
|
|
94
|
+
file_path = os.path.join(self.path_to_folder, file_name)
|
|
95
|
+
text, labels = self.parse_xml(file_path)
|
|
96
|
+
data.append({"patient_id": file_name.replace(".xml", ""), "ehr": text, "labels": labels})
|
|
97
|
+
|
|
98
|
+
return data
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def get_date_of_note(patient: Dict[str, Any], note_idx: int) -> Optional[str]:
|
|
102
|
+
"""Get date of note for patient"""
|
|
103
|
+
assert note_idx <= len(patient["ehr"]), f"{note_idx} out of bounds for {patient['patient_id']}"
|
|
104
|
+
note: str = patient["ehr"][note_idx]
|
|
105
|
+
match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
|
|
106
|
+
date = match.group(1) if match else None
|
|
107
|
+
if not date:
|
|
108
|
+
print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
|
|
109
|
+
return date
|
|
110
|
+
|
|
111
|
+
@staticmethod
|
|
112
|
+
def get_current_date_for_patient(patient: Dict[str, Any]) -> Optional[str]:
|
|
113
|
+
"""Get most recent date visible in files for a given patient"""
|
|
114
|
+
most_recent_date = None
|
|
115
|
+
for note in patient["ehr"]:
|
|
116
|
+
match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
|
|
117
|
+
most_recent_date = match.group(1) if match else most_recent_date
|
|
118
|
+
if not most_recent_date:
|
|
119
|
+
print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
|
|
120
|
+
return most_recent_date
|
|
121
|
+
|
|
122
|
+
def parse_xml(self, XML_file) -> Tuple[List[str], Dict[str, str]]:
|
|
123
|
+
tree = ET.parse(XML_file)
|
|
124
|
+
root = tree.getroot()
|
|
125
|
+
text_content = ""
|
|
126
|
+
result_text: List[str] = []
|
|
127
|
+
tags = {}
|
|
128
|
+
for elem in root.iter():
|
|
129
|
+
if elem.tag == "TEXT":
|
|
130
|
+
text_content = elem.text if elem.text else ""
|
|
131
|
+
if self.is_remove_excessive_new_lines:
|
|
132
|
+
text_content = self.remove_excessive_newlines(text_content)
|
|
133
|
+
if self.is_split_text:
|
|
134
|
+
result_text = self.split_text(text_content)
|
|
135
|
+
else:
|
|
136
|
+
result_text = [text_content]
|
|
137
|
+
elif elem.tag == "TAGS":
|
|
138
|
+
tags = self.read_tags(root)
|
|
139
|
+
return (result_text, tags)
|
|
140
|
+
|
|
141
|
+
def read_tags(self, root) -> Dict[str, str]:
|
|
142
|
+
"""Reads the tags from an XML file and returns a dictionary of tags"""
|
|
143
|
+
tags_dict = {}
|
|
144
|
+
for tag in root.iter("TAGS"):
|
|
145
|
+
for subtag in tag:
|
|
146
|
+
met_value = subtag.attrib.get("met")
|
|
147
|
+
if self.is_convert_to_numbers:
|
|
148
|
+
met_value = 1 if met_value == "met" else 0
|
|
149
|
+
tags_dict[subtag.tag] = met_value
|
|
150
|
+
return tags_dict
|
|
151
|
+
|
|
152
|
+
def split_text(self, text: str) -> List[str]:
|
|
153
|
+
split_char = "*" * 100
|
|
154
|
+
parts = [x.strip() for x in text.split(split_char) if x.strip() != ""]
|
|
155
|
+
return parts
|
|
156
|
+
|
|
157
|
+
def remove_excessive_newlines(self, text: str) -> str:
|
|
158
|
+
text = text.replace("\n\n\n", "\n")
|
|
159
|
+
return text
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class N2C2CTMatchingScenario(Scenario):
|
|
163
|
+
"""
|
|
164
|
+
From "Cohort selection for clinical trials: n2c2 2018 shared task track 1" (Stubbs et al. 2019).
|
|
165
|
+
N2C2 is a collection of 288 patients (202 train / 86 test), each with 2-5 deidentified real-world clinical notes.
|
|
166
|
+
We use the prompt LLM formulation from Wornow et al. (2024).
|
|
167
|
+
|
|
168
|
+
Citation
|
|
169
|
+
```
|
|
170
|
+
@article{stubbs2019cohort,
|
|
171
|
+
title={Cohort selection for clinical trials: n2c2 2018 shared task track 1},
|
|
172
|
+
author={Stubbs, Amber and Filannino, Michele and Soysal, Ergin and Henry, Samuel and Uzuner, {\"O}zlem},
|
|
173
|
+
journal={Journal of the American Medical Informatics Association},
|
|
174
|
+
volume={26},
|
|
175
|
+
number={11},
|
|
176
|
+
pages={1163--1171},
|
|
177
|
+
year={2019},
|
|
178
|
+
publisher={Oxford University Press}
|
|
179
|
+
}
|
|
180
|
+
@article{wornow2024zero,
|
|
181
|
+
title={Zero-shot clinical trial patient matching with llms},
|
|
182
|
+
author={Wornow, Michael and Lozano, Alejandro and Dash, Dev and Jindal, Jenelle and Mahaffey, \
|
|
183
|
+
Kenneth W and Shah, Nigam H},
|
|
184
|
+
journal={NEJM AI},
|
|
185
|
+
pages={AIcs2400360},
|
|
186
|
+
year={2024},
|
|
187
|
+
publisher={Massachusetts Medical Society}
|
|
188
|
+
}
|
|
189
|
+
```
|
|
190
|
+
"""
|
|
191
|
+
|
|
192
|
+
name = "n2c2_ct_matching"
|
|
193
|
+
description = (
|
|
194
|
+
"N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to"
|
|
195
|
+
"appropriate clinical trials based on eligibility criteria. Each example includes a clinical"
|
|
196
|
+
"note and a trial description. The model is tasked with determining whether the patient"
|
|
197
|
+
"is a valid candidate for the trial. This benchmark supports automation and decision"
|
|
198
|
+
"support in clinical research enrollment."
|
|
199
|
+
)
|
|
200
|
+
tags = [] # TODO
|
|
201
|
+
|
|
202
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = [
|
|
203
|
+
"yes",
|
|
204
|
+
"no",
|
|
205
|
+
]
|
|
206
|
+
|
|
207
|
+
def __init__(self, data_path: str, subject: str):
|
|
208
|
+
super().__init__()
|
|
209
|
+
self.subject: str = subject # specific inclusion criterion to assess
|
|
210
|
+
self.data_path: str = data_path
|
|
211
|
+
self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
|
|
212
|
+
self.path_to_test_dir: str = os.path.join(self.data_path, "test/")
|
|
213
|
+
|
|
214
|
+
def create_prompt(self, patient: Dict[str, Any]) -> str:
|
|
215
|
+
# Cast None values to empty strings during string formatting, but keep the original functions returning None
|
|
216
|
+
notes_list = [
|
|
217
|
+
f"## Note #{i+1}\nDate: {XMLDataLoader.get_date_of_note(patient, i) or ''}\n{note}"
|
|
218
|
+
for i, note in enumerate(patient["ehr"])
|
|
219
|
+
]
|
|
220
|
+
notes: str = ("\n" + "*" * 50 + "\n\n").join(notes_list)
|
|
221
|
+
current_date = XMLDataLoader.get_current_date_for_patient(patient)
|
|
222
|
+
prompt = f"""
|
|
223
|
+
# Task
|
|
224
|
+
Your job is to decide whether the given patient meets the inclusion criterion for a clinical trial.
|
|
225
|
+
|
|
226
|
+
# Inclusion Criterion
|
|
227
|
+
The inclusion criterion being assessed is: "{self.subject}".
|
|
228
|
+
The definition of the inclusion criterion is: "{LONG_DEFINITIONS[self.subject]}".
|
|
229
|
+
|
|
230
|
+
# Patient Clinical Notes
|
|
231
|
+
Below is a set of {len(patient['ehr'])} clinical notes describing the patient's current health status. \
|
|
232
|
+
Each note is separated by a header with the date that the note was written, as well as a long list of asterisks.
|
|
233
|
+
|
|
234
|
+
{'-' * 100}
|
|
235
|
+
|
|
236
|
+
{notes}
|
|
237
|
+
|
|
238
|
+
{'-' * 100}
|
|
239
|
+
|
|
240
|
+
# Current Date
|
|
241
|
+
Assume that the current date is: {current_date}
|
|
242
|
+
|
|
243
|
+
# Question
|
|
244
|
+
Does the patient meet the inclusion criterion "{self.subject}"?
|
|
245
|
+
"""
|
|
246
|
+
return prompt
|
|
247
|
+
|
|
248
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
249
|
+
instances: List[Instance] = []
|
|
250
|
+
for split in ["train", "test"]:
|
|
251
|
+
# limit to zero shot setting
|
|
252
|
+
if split == "test":
|
|
253
|
+
path_to_data = self.path_to_train_dir if split == "train" else self.path_to_test_dir
|
|
254
|
+
ensure_directory_exists(path_to_data)
|
|
255
|
+
|
|
256
|
+
# Load dataset
|
|
257
|
+
dataloader = XMLDataLoader(path_to_data)
|
|
258
|
+
dataset = dataloader.load_data()
|
|
259
|
+
|
|
260
|
+
# Create instances
|
|
261
|
+
for patient in dataset:
|
|
262
|
+
is_met: bool = patient["labels"][self.subject]
|
|
263
|
+
correct_answer: str = "yes" if is_met else "no"
|
|
264
|
+
|
|
265
|
+
# Build `References. The possible answer choices are "yes" or "no"
|
|
266
|
+
references: List[Reference] = [
|
|
267
|
+
Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
|
|
268
|
+
for answer in N2C2CTMatchingScenario.POSSIBLE_ANSWER_CHOICES
|
|
269
|
+
]
|
|
270
|
+
|
|
271
|
+
instances.append(
|
|
272
|
+
Instance(
|
|
273
|
+
input=Input(text=self.create_prompt(patient)),
|
|
274
|
+
references=references,
|
|
275
|
+
split=TRAIN_SPLIT if split == "train" else TEST_SPLIT,
|
|
276
|
+
)
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
return instances
|
|
280
|
+
|
|
281
|
+
def get_metadata(self):
|
|
282
|
+
return ScenarioMetadata(
|
|
283
|
+
name="n2c2_ct_matching",
|
|
284
|
+
display_name="N2C2-CT Matching",
|
|
285
|
+
short_display_name="N2C2-CT",
|
|
286
|
+
description="A dataset that provides clinical notes and asks the model to classify whether "
|
|
287
|
+
"the patient is a valid candidate for a provided clinical trial.",
|
|
288
|
+
taxonomy=TaxonomyInfo(
|
|
289
|
+
task="Classification",
|
|
290
|
+
what="Classify whether a patient is a valid candidate for a clinical " "trial based on clinical notes",
|
|
291
|
+
when="Pre-Trial",
|
|
292
|
+
who="Researcher",
|
|
293
|
+
language="English",
|
|
294
|
+
),
|
|
295
|
+
main_metric="exact_match",
|
|
296
|
+
main_split="test",
|
|
297
|
+
)
|
|
@@ -3,8 +3,9 @@ import random
|
|
|
3
3
|
import csv
|
|
4
4
|
from typing import List, Dict
|
|
5
5
|
|
|
6
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
7
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
|
|
7
|
-
from .scenario import (
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
9
|
Scenario,
|
|
9
10
|
Instance,
|
|
10
11
|
Reference,
|
|
@@ -13,6 +14,7 @@ from .scenario import (
|
|
|
13
14
|
Input,
|
|
14
15
|
PassageQuestionInput,
|
|
15
16
|
Output,
|
|
17
|
+
ScenarioMetadata,
|
|
16
18
|
)
|
|
17
19
|
|
|
18
20
|
|
|
@@ -155,3 +157,20 @@ class NarrativeQAScenario(Scenario):
|
|
|
155
157
|
instances.extend(self.get_split_instances(summaries_file=summaries_file, qaps_file=qaps_file, split=split))
|
|
156
158
|
|
|
157
159
|
return instances
|
|
160
|
+
|
|
161
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
162
|
+
return ScenarioMetadata(
|
|
163
|
+
name="narrative_qa",
|
|
164
|
+
display_name="NarrativeQA",
|
|
165
|
+
description="The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský "
|
|
166
|
+
"et al., 2017)](https://aclanthology.org/Q18-1023/).",
|
|
167
|
+
taxonomy=TaxonomyInfo(
|
|
168
|
+
task="short-answer question answering",
|
|
169
|
+
what="passages are books and movie scripts, questions are unknown",
|
|
170
|
+
when="2018",
|
|
171
|
+
who="annotators from summaries",
|
|
172
|
+
language="English",
|
|
173
|
+
),
|
|
174
|
+
main_metric="f1_score",
|
|
175
|
+
main_split="test",
|
|
176
|
+
)
|
|
@@ -6,11 +6,12 @@ import os
|
|
|
6
6
|
import re
|
|
7
7
|
import html
|
|
8
8
|
import random
|
|
9
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
9
10
|
from helm.common.hierarchical_logger import htrack_block, hlog
|
|
10
11
|
from typing import List, Dict
|
|
11
12
|
|
|
12
13
|
from helm.common.general import ensure_file_downloaded, ensure_directory_exists, asdict_without_nones
|
|
13
|
-
from .scenario import (
|
|
14
|
+
from helm.benchmark.scenarios.scenario import (
|
|
14
15
|
Scenario,
|
|
15
16
|
Instance,
|
|
16
17
|
Reference,
|
|
@@ -20,6 +21,7 @@ from .scenario import (
|
|
|
20
21
|
PassageQuestionInput,
|
|
21
22
|
Input,
|
|
22
23
|
Output,
|
|
24
|
+
ScenarioMetadata,
|
|
23
25
|
)
|
|
24
26
|
|
|
25
27
|
|
|
@@ -324,3 +326,33 @@ class NaturalQAScenario(Scenario):
|
|
|
324
326
|
instances.extend(self.get_file_instances(target_path))
|
|
325
327
|
|
|
326
328
|
return instances
|
|
329
|
+
|
|
330
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
331
|
+
if self.context_mode == "closedbook":
|
|
332
|
+
name = "natural_qa_closedbook"
|
|
333
|
+
display_name = "NaturalQuestions (closed-book)"
|
|
334
|
+
elif self.context_mode == "openbook_longans":
|
|
335
|
+
name = "natural_qa_openbook_longans"
|
|
336
|
+
display_name = "NaturalQuestions (open-book)"
|
|
337
|
+
elif self.context_mode == "openbook_wiki":
|
|
338
|
+
name = "natural_qa_openbook_wiki"
|
|
339
|
+
display_name = "NaturalQuestions (open-book Wiki)"
|
|
340
|
+
else:
|
|
341
|
+
raise Exception(f"Unknown context_mode {self.context_mode}")
|
|
342
|
+
return ScenarioMetadata(
|
|
343
|
+
name=name,
|
|
344
|
+
display_name=display_name,
|
|
345
|
+
description="The NaturalQuestions [(Kwiatkowski et al., "
|
|
346
|
+
"2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering "
|
|
347
|
+
"based on naturally-occurring queries through Google Search. The input does not "
|
|
348
|
+
"include the Wikipedia page with the answer.",
|
|
349
|
+
taxonomy=TaxonomyInfo(
|
|
350
|
+
task="question answering",
|
|
351
|
+
what="passages from Wikipedia, questions from search queries",
|
|
352
|
+
when="2010s",
|
|
353
|
+
who="web users",
|
|
354
|
+
language="English",
|
|
355
|
+
),
|
|
356
|
+
main_metric="f1_score",
|
|
357
|
+
main_split="valid",
|
|
358
|
+
)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from typing import List, Any
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from datasets import load_dataset
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
CORRECT_TAG,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class OABExamsScenario(Scenario):
|
|
17
|
+
"""
|
|
18
|
+
The OAB Exam is a mandatory test for anyone who wants to practice law in Brazil. The exam is composed for
|
|
19
|
+
an objective test with 80 multiple-choice questions covering all areas of Law and a written phase focused
|
|
20
|
+
on a specific legal area (e.g., Civil, Criminal, Labor Law), where candidates must draft a legal document
|
|
21
|
+
and answer four essay questions.
|
|
22
|
+
|
|
23
|
+
This dataset is composed by the exams that occured between 2010 and 2018.
|
|
24
|
+
|
|
25
|
+
The dataset can be found in this link: https://huggingface.co/datasets/eduagarcia/oab_exams
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name = "oab_exams"
|
|
29
|
+
description = "OAB exams dataset"
|
|
30
|
+
tags = ["knowledge", "multiple_choice", "pt-br"]
|
|
31
|
+
|
|
32
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
33
|
+
# Download the raw data and read all the dialogues
|
|
34
|
+
dataset: Any
|
|
35
|
+
# Read all the instances
|
|
36
|
+
instances: List[Instance] = []
|
|
37
|
+
cache_dir = str(Path(output_path) / "data")
|
|
38
|
+
|
|
39
|
+
dataset = load_dataset("eduagarcia/oab_exams", cache_dir=cache_dir)
|
|
40
|
+
for example in dataset["train"]:
|
|
41
|
+
question = example["question"]
|
|
42
|
+
choices = example["choices"]
|
|
43
|
+
answer = example["answerKey"]
|
|
44
|
+
# Skipping every canceled question!
|
|
45
|
+
if example["nullified"]:
|
|
46
|
+
continue
|
|
47
|
+
answers_dict = dict(zip(choices["label"], choices["text"]))
|
|
48
|
+
correct_answer = answers_dict[answer]
|
|
49
|
+
|
|
50
|
+
def answer_to_reference(answer: str) -> Reference:
|
|
51
|
+
return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
|
|
52
|
+
|
|
53
|
+
instance = Instance(
|
|
54
|
+
input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
|
|
55
|
+
)
|
|
56
|
+
instances.append(instance)
|
|
57
|
+
return instances
|