crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -2,8 +2,18 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
5
6
|
from helm.common.general import ensure_file_downloaded
|
|
6
|
-
from .scenario import
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
Reference,
|
|
10
|
+
Scenario,
|
|
11
|
+
Instance,
|
|
12
|
+
Input,
|
|
13
|
+
TEST_SPLIT,
|
|
14
|
+
Output,
|
|
15
|
+
ScenarioMetadata,
|
|
16
|
+
)
|
|
7
17
|
|
|
8
18
|
|
|
9
19
|
class SelfInstructScenario(Scenario):
|
|
@@ -46,3 +56,21 @@ class SelfInstructScenario(Scenario):
|
|
|
46
56
|
)
|
|
47
57
|
instances.append(instance)
|
|
48
58
|
return instances
|
|
59
|
+
|
|
60
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
61
|
+
return ScenarioMetadata(
|
|
62
|
+
name="self_instruct",
|
|
63
|
+
display_name="Self Instruct",
|
|
64
|
+
short_display_name="Self Instruct",
|
|
65
|
+
description="The manually-curated instructions from the Self-Instruct paper ([Wang et al., "
|
|
66
|
+
"2023](https://aclanthology.org/2023.acl-long.754.pdf)).",
|
|
67
|
+
taxonomy=TaxonomyInfo(
|
|
68
|
+
task="open-ended instruction following",
|
|
69
|
+
what="Instructions for LLMs",
|
|
70
|
+
when="2022",
|
|
71
|
+
who="Authors of the research paper",
|
|
72
|
+
language="English",
|
|
73
|
+
),
|
|
74
|
+
main_metric="Helpfulness",
|
|
75
|
+
main_split="test",
|
|
76
|
+
)
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import check_file_exists
|
|
17
|
+
|
|
18
|
+
csv.field_size_limit(sys.maxsize)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SHCBMTMedScenario(Scenario):
|
|
22
|
+
"""
|
|
23
|
+
This benchmark dataset was built from a patient status gold-standard
|
|
24
|
+
for specific questions asked after a bone marrow transplant has taken place.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
name = "shc_bmt_med"
|
|
28
|
+
description = (
|
|
29
|
+
"BMT-Status is a benchmark composed of clinical notes and associated binary questions"
|
|
30
|
+
"related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT),"
|
|
31
|
+
"or hematopoietic cell transplant (HCT) status. The goal is to determine whether the"
|
|
32
|
+
"patient received a subsequent transplant based on the provided clinical documentation."
|
|
33
|
+
)
|
|
34
|
+
tags = ["knowledge", "reasoning", "biomedical"]
|
|
35
|
+
|
|
36
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
|
|
37
|
+
|
|
38
|
+
def __init__(self, data_path: str):
|
|
39
|
+
super().__init__()
|
|
40
|
+
self.data_path = data_path
|
|
41
|
+
|
|
42
|
+
def create_benchmark(self, csv_path) -> Dict[str, str]:
|
|
43
|
+
data = {}
|
|
44
|
+
with open(csv_path, "r") as file:
|
|
45
|
+
reader = csv.DictReader(file)
|
|
46
|
+
for row in reader:
|
|
47
|
+
question = row["prompt"]
|
|
48
|
+
context = row["context"]
|
|
49
|
+
answer = row["label"]
|
|
50
|
+
prompt = (
|
|
51
|
+
f"Provide an answer to the following question: {question} with the following context: {context} "
|
|
52
|
+
", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
|
|
53
|
+
"details or response, just a simple A or B response."
|
|
54
|
+
)
|
|
55
|
+
data[prompt] = answer
|
|
56
|
+
return data
|
|
57
|
+
|
|
58
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
59
|
+
check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
|
|
60
|
+
instances: List[Instance] = []
|
|
61
|
+
benchmark_data = self.create_benchmark(self.data_path)
|
|
62
|
+
|
|
63
|
+
for prompt, answer in benchmark_data.items():
|
|
64
|
+
assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
65
|
+
references: List[Reference] = [
|
|
66
|
+
Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
|
|
67
|
+
for pred_answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
68
|
+
]
|
|
69
|
+
instances.append(
|
|
70
|
+
Instance(
|
|
71
|
+
input=Input(text=prompt),
|
|
72
|
+
references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
|
|
73
|
+
split=TEST_SPLIT,
|
|
74
|
+
)
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return instances
|
|
78
|
+
|
|
79
|
+
def get_metadata(self):
|
|
80
|
+
return ScenarioMetadata(
|
|
81
|
+
name="shc_bmt_med",
|
|
82
|
+
display_name="BMT-Status",
|
|
83
|
+
description="BMT-Status is a benchmark composed of clinical notes and associated binary "
|
|
84
|
+
"questions related to bone marrow transplant (BMT), hematopoietic stem cell "
|
|
85
|
+
"transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is "
|
|
86
|
+
"to determine whether the patient received a subsequent transplant based on the "
|
|
87
|
+
"provided clinical documentation.",
|
|
88
|
+
taxonomy=TaxonomyInfo(
|
|
89
|
+
task="question answering",
|
|
90
|
+
what="Answer bone marrow transplant questions",
|
|
91
|
+
when="Any",
|
|
92
|
+
who="Researcher",
|
|
93
|
+
language="English",
|
|
94
|
+
),
|
|
95
|
+
main_metric="exact_match",
|
|
96
|
+
main_split="test",
|
|
97
|
+
)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import check_file_exists
|
|
17
|
+
|
|
18
|
+
csv.field_size_limit(sys.maxsize)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SHCCDIMedScenario(Scenario):
|
|
22
|
+
"""
|
|
23
|
+
This benchmark dataset was built from Clinical Document Integrity (CDI)
|
|
24
|
+
notes were there are verifications of clinical activities. The idea behind
|
|
25
|
+
it was to assess an LLM capability to answer these questions from previous notes.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name = "shc_cdi_med"
|
|
29
|
+
description = (
|
|
30
|
+
"CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI)"
|
|
31
|
+
"notes. It is used to evaluate a model's ability to verify clinical conditions based on"
|
|
32
|
+
"documented evidence in patient records."
|
|
33
|
+
)
|
|
34
|
+
tags = ["knowledge", "reasoning", "biomedical"]
|
|
35
|
+
|
|
36
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
|
|
37
|
+
|
|
38
|
+
def __init__(self, data_path: str):
|
|
39
|
+
super().__init__()
|
|
40
|
+
self.data_path = data_path
|
|
41
|
+
|
|
42
|
+
def create_benchmark(self, csv_path) -> Dict[str, str]:
|
|
43
|
+
data = {}
|
|
44
|
+
with open(csv_path, "r") as file:
|
|
45
|
+
reader = csv.DictReader(file)
|
|
46
|
+
for row in reader:
|
|
47
|
+
question = row["prompt"]
|
|
48
|
+
context = row["context"]
|
|
49
|
+
answer = row["label"]
|
|
50
|
+
prompt = (
|
|
51
|
+
f"Provide an answer to the following question: {question} with the following context: {context} , "
|
|
52
|
+
"Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
|
|
53
|
+
"additional details or response, just a simple A or B response."
|
|
54
|
+
)
|
|
55
|
+
data[prompt] = answer
|
|
56
|
+
return data
|
|
57
|
+
|
|
58
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
59
|
+
check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
|
|
60
|
+
instances: List[Instance] = []
|
|
61
|
+
benchmark_data = self.create_benchmark(self.data_path)
|
|
62
|
+
|
|
63
|
+
for prompt, answer in benchmark_data.items():
|
|
64
|
+
assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
65
|
+
references: List[Reference] = [
|
|
66
|
+
Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
|
|
67
|
+
for pred_answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
68
|
+
]
|
|
69
|
+
instances.append(
|
|
70
|
+
Instance(
|
|
71
|
+
input=Input(text=prompt),
|
|
72
|
+
references=references,
|
|
73
|
+
split=TEST_SPLIT,
|
|
74
|
+
)
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
return instances
|
|
78
|
+
|
|
79
|
+
def get_metadata(self):
|
|
80
|
+
return ScenarioMetadata(
|
|
81
|
+
name="shc_cdi_med",
|
|
82
|
+
display_name="CDI-QA",
|
|
83
|
+
description="CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) "
|
|
84
|
+
"notes. It is used to evaluate a model's ability to verify clinical conditions "
|
|
85
|
+
"based on documented evidence in patient records.",
|
|
86
|
+
taxonomy=TaxonomyInfo(
|
|
87
|
+
task="Classification",
|
|
88
|
+
what="Answer verification questions from CDI notes",
|
|
89
|
+
when="Any",
|
|
90
|
+
who="Hospital Admistrator",
|
|
91
|
+
language="English",
|
|
92
|
+
),
|
|
93
|
+
main_metric="exact_match",
|
|
94
|
+
main_split="test",
|
|
95
|
+
)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import check_file_exists
|
|
17
|
+
|
|
18
|
+
csv.field_size_limit(sys.maxsize)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SHCCONFMedScenario(Scenario):
|
|
22
|
+
"""
|
|
23
|
+
Benchmark derived from extracting confidential information from clinical notes.
|
|
24
|
+
From Evaluation of a Large Language Model to Identify Confidential Content in
|
|
25
|
+
Adolescent Encounter Notes published at https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name = "shc_conf_med"
|
|
29
|
+
description = (
|
|
30
|
+
"MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is"
|
|
31
|
+
"used to evaluate whether the content contains sensitive protected health information"
|
|
32
|
+
"(PHI) that should be restricted from parental access, in accordance with adolescent"
|
|
33
|
+
"confidentiality policies in clinical care."
|
|
34
|
+
)
|
|
35
|
+
tags = ["knowledge", "reasoning", "biomedical"]
|
|
36
|
+
|
|
37
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
|
|
38
|
+
|
|
39
|
+
def __init__(self, data_path: str):
|
|
40
|
+
super().__init__()
|
|
41
|
+
self.data_path = data_path
|
|
42
|
+
|
|
43
|
+
def create_benchmark(self, csv_path) -> Dict[str, str]:
|
|
44
|
+
data = {}
|
|
45
|
+
with open(csv_path, "r") as file:
|
|
46
|
+
reader = csv.DictReader(file)
|
|
47
|
+
for row in reader:
|
|
48
|
+
question = row["prompt"]
|
|
49
|
+
context = row["context"]
|
|
50
|
+
answer = row["label"]
|
|
51
|
+
prompt = (
|
|
52
|
+
f"Provide an answer to the following question: {question} with the following context: {context} "
|
|
53
|
+
", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
|
|
54
|
+
"details or response, just a simple A or B response."
|
|
55
|
+
)
|
|
56
|
+
data[prompt] = answer
|
|
57
|
+
return data
|
|
58
|
+
|
|
59
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
60
|
+
check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
|
|
61
|
+
instances: List[Instance] = []
|
|
62
|
+
benchmark_data = self.create_benchmark(self.data_path)
|
|
63
|
+
|
|
64
|
+
for prompt, answer in benchmark_data.items():
|
|
65
|
+
assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
66
|
+
references: List[Reference] = [
|
|
67
|
+
Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
|
|
68
|
+
for pred_answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
69
|
+
]
|
|
70
|
+
instances.append(
|
|
71
|
+
Instance(
|
|
72
|
+
input=Input(text=prompt),
|
|
73
|
+
references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
|
|
74
|
+
split=TEST_SPLIT,
|
|
75
|
+
)
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
return instances
|
|
79
|
+
|
|
80
|
+
def get_metadata(self):
|
|
81
|
+
return ScenarioMetadata(
|
|
82
|
+
name="shc_conf_med",
|
|
83
|
+
display_name="MedConfInfo",
|
|
84
|
+
description="MedConfInfo is a benchmark comprising clinical notes from adolescent patients. "
|
|
85
|
+
"It is used to evaluate whether the content contains sensitive protected health "
|
|
86
|
+
"information (PHI) that should be restricted from parental access, in "
|
|
87
|
+
"accordance with adolescent confidentiality policies in clinical care. "
|
|
88
|
+
"[(Rabbani et al., "
|
|
89
|
+
"2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).",
|
|
90
|
+
taxonomy=TaxonomyInfo(
|
|
91
|
+
task="Classification",
|
|
92
|
+
what="Identify sensitive health info in adolescent notes",
|
|
93
|
+
when="Any",
|
|
94
|
+
who="Clinician",
|
|
95
|
+
language="English",
|
|
96
|
+
),
|
|
97
|
+
main_metric="exact_match",
|
|
98
|
+
main_split="test",
|
|
99
|
+
)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import check_file_exists
|
|
17
|
+
|
|
18
|
+
csv.field_size_limit(sys.maxsize)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SHCENTMedScenario(Scenario):
|
|
22
|
+
"""
|
|
23
|
+
This benchmark dataset was built to assess the capabilities "
|
|
24
|
+
"of an LLM for referral to the Ear, Nose and Throat department.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
name = "shc_ent_med"
|
|
28
|
+
description = (
|
|
29
|
+
"ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note"
|
|
30
|
+
"supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess"
|
|
31
|
+
"models' abilities to make referral decisions based on unstructured clinical text."
|
|
32
|
+
)
|
|
33
|
+
tags = ["knowledge", "reasoning", "biomedical"]
|
|
34
|
+
|
|
35
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
|
|
36
|
+
|
|
37
|
+
def __init__(self, data_path: str):
|
|
38
|
+
super().__init__()
|
|
39
|
+
self.data_path = data_path
|
|
40
|
+
|
|
41
|
+
def create_benchmark(self, csv_path) -> Dict[str, str]:
|
|
42
|
+
data = {}
|
|
43
|
+
counter = 1
|
|
44
|
+
with open(csv_path, "r") as file:
|
|
45
|
+
reader = csv.DictReader(file)
|
|
46
|
+
for row in reader:
|
|
47
|
+
if row["label"] != "": # skip rows with character/encoding issues - 79
|
|
48
|
+
question = row["prompt"]
|
|
49
|
+
context = row["context"]
|
|
50
|
+
answer = row["label"]
|
|
51
|
+
prompt = (
|
|
52
|
+
f"{counter} Provide an answer to the following question: {question} with the following context:"
|
|
53
|
+
f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
|
|
54
|
+
" Do not provide any additional details or response, just a simple A, B, or C response."
|
|
55
|
+
)
|
|
56
|
+
data[prompt] = answer
|
|
57
|
+
counter = counter + 1
|
|
58
|
+
return data
|
|
59
|
+
|
|
60
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
61
|
+
check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
|
|
62
|
+
instances: List[Instance] = []
|
|
63
|
+
benchmark_data = self.create_benchmark(self.data_path)
|
|
64
|
+
|
|
65
|
+
for prompt, answer in benchmark_data.items():
|
|
66
|
+
assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
67
|
+
references: List[Reference] = [
|
|
68
|
+
Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
|
|
69
|
+
for pred_answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
70
|
+
]
|
|
71
|
+
instances.append(
|
|
72
|
+
Instance(
|
|
73
|
+
input=Input(text=prompt),
|
|
74
|
+
references=references,
|
|
75
|
+
split=TEST_SPLIT,
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
return instances
|
|
80
|
+
|
|
81
|
+
def get_metadata(self):
|
|
82
|
+
return ScenarioMetadata(
|
|
83
|
+
name="shc_ent_med",
|
|
84
|
+
display_name="ENT-Referral",
|
|
85
|
+
description="ENT-Referral is a benchmark designed to evaluate whether a patient's clinical "
|
|
86
|
+
"note supports a referral to an Ear, Nose, and Throat (ENT) specialist. It "
|
|
87
|
+
"helps assess models' abilities to make referral decisions based on "
|
|
88
|
+
"unstructured clinical text",
|
|
89
|
+
taxonomy=TaxonomyInfo(
|
|
90
|
+
task="Classification",
|
|
91
|
+
what="Identify referrals for ENT specialists",
|
|
92
|
+
when="Any",
|
|
93
|
+
who="Hospital Admistrator",
|
|
94
|
+
language="English",
|
|
95
|
+
),
|
|
96
|
+
main_metric="exact_match",
|
|
97
|
+
main_split="test",
|
|
98
|
+
)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import check_file_exists
|
|
17
|
+
|
|
18
|
+
csv.field_size_limit(sys.maxsize)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SHCGIPMedScenario(Scenario):
|
|
22
|
+
"""
|
|
23
|
+
This benchmark dataset was built from a patient referral gold-standard set
|
|
24
|
+
to a specialty clinic to verify the ability of LLMs for patient hospice referral purposes.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
name = "shc_gip_med"
|
|
28
|
+
description = (
|
|
29
|
+
"HospiceReferral is a benchmark that evaluates model performance in identifying"
|
|
30
|
+
"whether patients are eligible for hospice care based on palliative care clinical notes."
|
|
31
|
+
"The benchmark focuses on end-of-life care referral decisions."
|
|
32
|
+
)
|
|
33
|
+
tags = ["knowledge", "reasoning", "biomedical"]
|
|
34
|
+
|
|
35
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
|
|
36
|
+
|
|
37
|
+
def __init__(self, data_path: str):
|
|
38
|
+
super().__init__()
|
|
39
|
+
self.data_path = data_path
|
|
40
|
+
|
|
41
|
+
def create_benchmark(self, csv_path) -> Dict[str, str]:
|
|
42
|
+
data = {}
|
|
43
|
+
with open(csv_path, "r") as file:
|
|
44
|
+
reader = csv.DictReader(file)
|
|
45
|
+
for row in reader:
|
|
46
|
+
question = row["prompt"]
|
|
47
|
+
context = row["context"]
|
|
48
|
+
answer = row["label"]
|
|
49
|
+
prompt = (
|
|
50
|
+
f"Provide an answer to the following question: {question} with the following context: {context} "
|
|
51
|
+
", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
|
|
52
|
+
"details or response, just a simple A or B response."
|
|
53
|
+
)
|
|
54
|
+
data[prompt] = answer
|
|
55
|
+
return data
|
|
56
|
+
|
|
57
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
58
|
+
check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
|
|
59
|
+
instances: List[Instance] = []
|
|
60
|
+
benchmark_data = self.create_benchmark(self.data_path)
|
|
61
|
+
|
|
62
|
+
for prompt, answer in benchmark_data.items():
|
|
63
|
+
assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
64
|
+
references: List[Reference] = [
|
|
65
|
+
Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
|
|
66
|
+
for pred_answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
67
|
+
]
|
|
68
|
+
instances.append(
|
|
69
|
+
Instance(
|
|
70
|
+
input=Input(text=prompt),
|
|
71
|
+
references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
|
|
72
|
+
split=TEST_SPLIT,
|
|
73
|
+
)
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
return instances
|
|
77
|
+
|
|
78
|
+
def get_metadata(self):
|
|
79
|
+
return ScenarioMetadata(
|
|
80
|
+
name="shc_gip_med",
|
|
81
|
+
display_name="HospiceReferral",
|
|
82
|
+
description="HospiceReferral is a benchmark that evaluates model performance in identifying "
|
|
83
|
+
"whether patients are eligible for hospice care based on palliative care "
|
|
84
|
+
"clinical notes. The benchmark focuses on end-of-life care referral decisions.",
|
|
85
|
+
taxonomy=TaxonomyInfo(
|
|
86
|
+
task="Classification",
|
|
87
|
+
what="Assess hospice referral appropriateness",
|
|
88
|
+
when="End-of-care",
|
|
89
|
+
who="Hospital Admistrator",
|
|
90
|
+
language="English",
|
|
91
|
+
),
|
|
92
|
+
main_metric="exact_match",
|
|
93
|
+
main_split="test",
|
|
94
|
+
)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Input,
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import check_file_exists
|
|
17
|
+
|
|
18
|
+
csv.field_size_limit(sys.maxsize)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SHCPRIVACYMedScenario(Scenario):
|
|
22
|
+
"""
|
|
23
|
+
This dataset features messages sent generated by an LLM from patient clinical notes data.
|
|
24
|
+
The scenario evaluates the ability of an LLM to determine if any potentially confidential
|
|
25
|
+
information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name = "shc_privacy_med"
|
|
29
|
+
description = (
|
|
30
|
+
"PrivacyDetection is a benchmark composed of patient portal messages submitted by"
|
|
31
|
+
"patients or caregivers. The task is to determine whether the message contains any"
|
|
32
|
+
"confidential or privacy-leaking information that should be protected."
|
|
33
|
+
)
|
|
34
|
+
tags = ["knowledge", "reasoning", "biomedical"]
|
|
35
|
+
|
|
36
|
+
POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
|
|
37
|
+
|
|
38
|
+
def __init__(self, data_path: str):
|
|
39
|
+
super().__init__()
|
|
40
|
+
self.data_path = data_path
|
|
41
|
+
|
|
42
|
+
def create_benchmark(self, csv_path) -> Dict[str, str]:
|
|
43
|
+
data = {}
|
|
44
|
+
with open(csv_path, "r") as file:
|
|
45
|
+
reader = csv.DictReader(file)
|
|
46
|
+
for row in reader:
|
|
47
|
+
question = row["prompt"]
|
|
48
|
+
context = row["context"]
|
|
49
|
+
answer = row["label"]
|
|
50
|
+
prompt = (
|
|
51
|
+
"You are reviewing clinical messages in order to determine if any confidential "
|
|
52
|
+
f"information was included. Please answer the following question: {question} with the "
|
|
53
|
+
f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
|
|
54
|
+
"for no. Do not provide any additional details or response, just a simple A or B response."
|
|
55
|
+
)
|
|
56
|
+
data[prompt] = answer
|
|
57
|
+
return data
|
|
58
|
+
|
|
59
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
60
|
+
check_file_exists(
|
|
61
|
+
self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
|
|
62
|
+
)
|
|
63
|
+
instances: List[Instance] = []
|
|
64
|
+
benchmark_data = self.create_benchmark(self.data_path)
|
|
65
|
+
|
|
66
|
+
for prompt, answer in benchmark_data.items():
|
|
67
|
+
assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
68
|
+
references: List[Reference] = [
|
|
69
|
+
Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
|
|
70
|
+
for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
|
|
71
|
+
]
|
|
72
|
+
instances.append(
|
|
73
|
+
Instance(
|
|
74
|
+
input=Input(text=prompt),
|
|
75
|
+
references=references, # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
|
|
76
|
+
split=TEST_SPLIT,
|
|
77
|
+
)
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
return instances
|
|
81
|
+
|
|
82
|
+
def get_metadata(self):
|
|
83
|
+
return ScenarioMetadata(
|
|
84
|
+
name="shc_privacy_med",
|
|
85
|
+
display_name="PrivacyDetection",
|
|
86
|
+
description="PrivacyDetection is a benchmark composed of patient portal messages submitted "
|
|
87
|
+
"by patients or caregivers. The task is to determine whether the message "
|
|
88
|
+
"contains any confidential or privacy-leaking information that should be "
|
|
89
|
+
"protected [(Tse G, et al., "
|
|
90
|
+
"2025)](https://doi.org/10.1001/jamapediatrics.2024.4438).",
|
|
91
|
+
taxonomy=TaxonomyInfo(
|
|
92
|
+
task="Classification",
|
|
93
|
+
what="Classify if a document leaks private information",
|
|
94
|
+
when="Any",
|
|
95
|
+
who="Clinician, Caregiver",
|
|
96
|
+
language="English",
|
|
97
|
+
),
|
|
98
|
+
main_metric="exact_match",
|
|
99
|
+
main_split="test",
|
|
100
|
+
)
|