crfm-helm: crfm_helm-0.4.0-py3-none-any.whl → crfm_helm-0.5.10-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
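
Note on the relocations above: tokenizer modules moved from helm/proxy/tokenizers/ to helm/tokenizers/, and client modules from helm/proxy/clients/ to helm/clients/. A minimal sketch of the import change this implies for downstream code; the HuggingFaceTokenizer class name is inferred from the module name and is not itself shown in this diff:

    # Old (0.4.0) import path, removed in 0.5.10:
    # from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
    # New (0.5.10) import path:
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
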
@@ -0,0 +1,132 @@
from typing import List, Tuple, Optional, Dict, Any
import json
import os
import shutil
import threading

from helm.benchmark.annotation.image2struct.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
from helm.benchmark.adaptation.request_state import RequestState
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.benchmark.scenarios.vision_language.image2struct.webpage.driver import ScreenshotOptions
from helm.benchmark.scenarios.vision_language.image2struct.webpage.utils import convert_html_to_text
from helm.benchmark.scenarios.vision_language.image2struct.webpage_scenario import serve_and_take_screenshot
from helm.benchmark.scenarios.scenario import ASSET_NAME_TAG, ASSET_PATH_TAG
from helm.common.general import ensure_directory_exists
from helm.common.cache import CacheConfig

try:
    from PIL import Image
    from html2text import HTML2Text
except ModuleNotFoundError as e:
    handle_module_not_found_error(e, suggestions=["image2struct"])


class WebpageCompilerAnnotator(ImageCompilerAnnotator):
    """Annotator that compiles the text completions into a webpage
    and takes a screenshot of the webpage."""

    name: str = "webpage_compiler"

    # Delimiters for the code block
    DELIMITERS: List[Tuple[str, str]] = [
        ("```json", "```"),
        ("```", "```"),
    ]

    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
        super().__init__(cache_config, file_storage_path)
        self._html2text = HTML2Text()
        self._html2text.ignore_links = True

    def postprocess_infos(self, infos: Dict[str, Any]) -> Dict[str, Any]:
        """Postprocess the infos."""
        annotations = super().postprocess_infos(infos)
        assert "html" in annotations, "The html field should be present in the infos"
        annotations["text"] = convert_html_to_text(self._html2text, infos["html"])
        return annotations

    def compile_completion_into_image(
        self, request_state: RequestState, completion_text: str
    ) -> Tuple[Image.Image, Dict[str, Any]]:
        """Given a completion, parse the code and compile it into an image and return the image and the infos."""
        # Create a temporary directory to store the files
        cache_config: CacheConfig = self._cache.config
        repo_path: str = "prod_env/tmp"
        if hasattr(cache_config, "path"):
            repo_path = os.path.join(os.path.dirname(cache_config.path), "tmp")
        # Make the repo path thread-safe by appending the thread id
        repo_path = f"{repo_path}_{threading.get_ident()}"
        ensure_directory_exists(repo_path)

        # Check for code block delimiters.
        # After this, the completion should be a valid JSON object.
        for start, end in self.DELIMITERS:
            if start in completion_text and end in completion_text[completion_text.index(start) + len(start) :]:
                start_index = completion_text.index(start) + len(start)
                end_index = completion_text.index(end, start_index)
                completion_text = completion_text[start_index:end_index]
                break

        # Parse code into json object
        structure: dict
        try:
            structure = json.loads(completion_text)
        except json.JSONDecodeError as e:
            raise CompilationError(f"Failed to parse the completion as a JSON object: {e}") from e

        # Copy the assets
        assets_paths: List[str] = []
        assets_names: List[str] = []
        for reference in request_state.instance.references:
            if ASSET_PATH_TAG in reference.tags:
                assert reference.output.multimedia_content is not None
                for media_object in reference.output.multimedia_content.media_objects:
                    assert media_object.is_local_file
                    assert media_object.is_type("image")
                    assert type(media_object.location) == str
                    assets_paths.append(media_object.location)
            if ASSET_NAME_TAG in reference.tags:
                assert reference.output.multimedia_content is not None
                for media_object in reference.output.multimedia_content.media_objects:
                    assert media_object.is_type("text")
                    assert type(media_object.text) == str
                    assets_names.append(media_object.text)
        assert len(assets_paths) == len(assets_names)
        for asset_path, asset_name in zip(assets_paths, assets_names):
            dest_path: str = os.path.join(repo_path, asset_name)
            # Make sure the parent directory exists
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.copyfile(asset_path, dest_path)
            # os.symlink(asset_path, dest_path)

        # Create each file in a temporary directory
        if not isinstance(structure, list):
            raise CompilationError("The completion should be a list of files")
        for item in structure:
            filename: Optional[str] = item.get("filename")
            content: Optional[str] = item.get("content")
            if filename is None or content is None:
                raise CompilationError("Each file should have a valid filename and content")
            if filename in assets_names:
                # Some models will include assets in their response like this:
                # {
                #     "filename": "chmber.jpg",
                #     "content": "The content of the chmber.jpg file is a binary image and cannot be displayed as text."
                # }
                # In this case, we skip the file creation
                continue
            # Create parent directories if they do not exist
            parent_dir = os.path.join(repo_path, os.path.dirname(filename))
            os.makedirs(parent_dir, exist_ok=True)
            with open(os.path.join(repo_path, filename), "w") as f:
                f.write(content)

        # Save the screenshot, load the image, and remove the file
        destination_path: str = os.path.join(repo_path, "output.png")
        infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, ScreenshotOptions())
        image: Image.Image = Image.open(destination_path)

        # Delete the repository
        shutil.rmtree(repo_path, ignore_errors=True)

        return image, infos

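For orientation, compile_completion_into_image only accepts completions that parse (after optional code-fence stripping) to a JSON list of objects with "filename" and "content" keys. A minimal sketch of an accepted completion; the HTML payload is illustrative, not from the diff:

    # Hypothetical model completion for WebpageCompilerAnnotator:
    completion_text = (
        "```json\n"
        '[{"filename": "index.html", "content": "<html><body><h1>Hi</h1></body></html>"}]\n'
        "```"
    )
    # The DELIMITERS loop strips the ```json ... ``` fence, json.loads yields the
    # one-file structure, and the file is written into the temporary repo before
    # serve_and_take_screenshot renders it.
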
@@ -0,0 +1,76 @@
import os
import re
from typing import Any

from helm.common.general import ensure_directory_exists
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator
from helm.benchmark.annotation.model_as_judge import score_with_reasoning
from helm.clients.auto_client import AutoClient

PROMPT_TEMPLATE = """You are a medical judge who will be given a user request and a model response to that request.
Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
The user's request will be provided in these tags:
<user_request>
{{QUESTION}}
</user_request>
The model's response will be provided in these tags:
<model_response>
{{ANSWER}}
</model_response>
Some potential correct responses will be provided in these tags:
<correct_responses>
{{CORRECT_RESPONSES}}
</correct_responses>
Carefully analyze the <model_response> contents.
Determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
# 1, if the model response fully matches the correct response content and intent for at least one correct response
# 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
# 0.3, if the model response contains some correct information but is incomplete and/or incorrect
# 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
Please output your one-sentence concise reasoning within the "reasoning" tags and your score within the "score" tags.
Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
Only output a tag-delimited object with the following format:
<reasoning>
INSERT_YOUR_REASONING_HERE
</reasoning>
<score>
INSERT_YOUR_SCORE_HERE
</score>
"""


class LiveQAAnnotator(Annotator):
    """The LiveQA autograder."""

    name = "live_qa"

    def __init__(self, auto_client: AutoClient, file_storage_path: str):
        self._auto_client = auto_client
        cache_dir = os.path.join(file_storage_path, "data")
        ensure_directory_exists(cache_dir)
        # Regex pattern is lenient to allow for typos, e.g. extra whitespace
        # (not used in annotate below; parsing is delegated to score_with_reasoning)
        self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

    def annotate(self, request_state: RequestState) -> Any:
        assert request_state.result
        assert len(request_state.result.completions) == 1
        model_input_text = request_state.request.prompt
        model_output_text = request_state.result.completions[0].text
        if not model_output_text.strip():
            return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
        correct_responses = ""
        for i, reference in enumerate(request_state.instance.references):
            correct_responses += "Correct Response #" + str(i) + ": " + reference.output.text + "\n"
        annotator_prompt = (
            PROMPT_TEMPLATE.strip()
            .replace("{{QUESTION}}", model_input_text)
            .replace("{{ANSWER}}", model_output_text)
            .replace("{{CORRECT_RESPONSES}}", correct_responses)
        )
        results = score_with_reasoning(
            self._auto_client, annotator_prompt, "openai/gpt-4o-2024-05-13", "openai/gpt-4o-2024-05-13"
        )
        reasoning = results.get("reasoning")
        score = results.get("score")
        return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}

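For reference, a judge reply in exactly the format this template requests would look like the following (contents illustrative); score_with_reasoning, whose parsing logic is not shown in this diff, is expected to turn it into the {"reasoning": ..., "score": ...} mapping that annotate() forwards:

    <reasoning>
    Matches correct response #0 in content and intent.
    </reasoning>
    <score>
    1
    </score>
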
@@ -0,0 +1,88 @@
from typing import Dict, Optional, Set

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient


PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of medical dialogue summaries.

Your goal is to assess how well the summary captures the patient-doctor conversation,
and how it compares to the gold response in terms of accuracy, completeness, and clarity.

The patient-doctor conversation will be provided in these tags:
<conversation>
{QUESTION}
</conversation>

The response will be provided in these tags:
<response>
{RESPONSE}
</response>

The reference response will be provided in these tags:
<gold_response>
{GOLD_RESPONSE}
</gold_response>

Carefully review the <response> and compare it to the <gold_response> when needed.

Please evaluate the generated summary on a scale of 1-5 (1 = poor, 5 = excellent)
for each of these three key dimensions:

Evaluation Criteria:
Accuracy (1-5)
- Does the summary correctly capture the main medical issue and clinical details from the conversation?

Completeness (1-5)
- Does the summary include all important medical information from the conversation?

Clarity (1-5)
- Is the summary easy to understand for clinical use?

Output Format:
Output your evaluation as a single valid JSON object matching the following structure:
{
    "accuracy": {
        "score": 0,
        "explanation": "Brief explanation of why this score was given."
    },
    "completeness": {
        "score": 0,
        "explanation": "Brief explanation of why this score was given."
    },
    "clarity": {
        "score": 0,
        "explanation": "Brief explanation of why this score was given."
    }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
  maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


class MedDialogAnnotator(LLMAsJuryAnnotator):
    """The MedDialog autograder."""

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
    ):
        super().__init__(
            name="med_dialog",
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=annotator_models,
        )

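This template and the four that follow all request the same verdict shape, and the per-criterion keys match the ANNOTATION_CRITERIA mapping passed to LLMAsJuryAnnotator. A filled-in example of the JSON a judge model is asked to return (scores and explanations illustrative):

    {
        "accuracy": {"score": 4, "explanation": "Captures the chief complaint and the medication change."},
        "completeness": {"score": 3, "explanation": "Omits the follow-up plan discussed at the end."},
        "clarity": {"score": 5, "explanation": "Concise and uses standard clinical terms."}
    }
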
@@ -0,0 +1,89 @@
from typing import Dict, Optional, Set

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient


PROMPT_TEMPLATE = """You are a medical expert responsible for evaluating the quality of a
response to a clinical instruction, given an associated Electronic Health Record (EHR).

Your goal is to assess how well the response interprets the EHR, follows clinical best practices,
and aligns with the gold response in terms of accuracy, completeness, and clarity.

The instruction and EHR pair will be provided in these tags:
<user_request>
{QUESTION}
</user_request>

The response will be provided in these tags:
<response>
{RESPONSE}
</response>

The gold response (reference answer) will be provided in these tags:
<gold_response>
{GOLD_RESPONSE}
</gold_response>

Carefully review the <response> based on the <user_request> and compare it to
the <gold_response> when needed.
For each category below, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent)
and provide a brief explanation for your score:

Evaluation Criteria:
Accuracy (1-5)
- Does the response provide correct medical information based on the EHR and clinical guidelines?

Completeness (1-5)
- Does the response include all important patient information and address the clinical question?

Clarity (1-5)
- Is the response written clearly so clinicians can easily understand it?

Output Format:
Generate a valid JSON object with your evaluation:
{
    "accuracy": {
        "score": 0,
        "explanation": "Explain why this score was given."
    },
    "completeness": {
        "score": 0,
        "explanation": "Explain why this score was given."
    },
    "clarity": {
        "score": 0,
        "explanation": "Explain why this score was given."
    }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
  maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


class MedalignAnnotator(LLMAsJuryAnnotator):
    """The Medalign autograder."""

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
    ):
        super().__init__(
            name="medalign",
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=annotator_models,
        )

@@ -0,0 +1,87 @@
from typing import Dict, Optional, Set

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient


PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
response to a consumer health question.
Your goal is to assess how well the generated response answers the medical question,
and how it compares to the gold response in terms of accuracy, completeness, and clarity.

The question will be provided in these tags:
<question>
{QUESTION}
</question>

The response will be provided in these tags:
<response>
{RESPONSE}
</response>

The reference answer will be provided in these tags:
<gold_response>
{GOLD_RESPONSE}
</gold_response>

Carefully analyze the <response> compared to the <gold_response> and the original <question>.
For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
and provide a short justification for your score.

Evaluation Criteria:
Accuracy (1-5)
- Is the medical information correct?

Completeness (1-5)
- Does the response fully answer the question and include important medical details?

Clarity (1-5)
- Is the response easy for consumers to understand?

Output Format:
Output the evaluation as a single valid JSON object matching the following structure:
{
    "accuracy": {
        "score": 0,
        "explanation": "Explain why this score was given."
    },
    "completeness": {
        "score": 0,
        "explanation": "Explain why this score was given."
    },
    "clarity": {
        "score": 0,
        "explanation": "Explain why this score was given."
    }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
  maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


class MediQAAnnotator(LLMAsJuryAnnotator):
    """The MediQA autograder."""

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
    ):
        super().__init__(
            name="medi_qa",
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=annotator_models,
        )

@@ -0,0 +1,86 @@
from typing import Dict, Optional, Set

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient


PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating responses to consumer medication questions.

Your goal is to assess how well the response captures the information asked for,
and how it compares to the gold response in terms of accuracy, completeness, and clarity.

The question will be provided in these tags:
<medication_question>
{QUESTION}
</medication_question>

The response will be provided in these tags:
<response>
{RESPONSE}
</response>

The reference response will be provided in these tags:
<gold_response>
{GOLD_RESPONSE}
</gold_response>

Carefully review the <response> and compare it to the <gold_response> when needed.
Please evaluate the response on a scale of 1-5 (1 = poor, 5 = excellent) for each of these three key dimensions:

Evaluation Criteria:
Accuracy (1-5)
- Is the information about the medication factually correct?

Completeness (1-5)
- Does the response fully answer the question, covering all important points?

Clarity (1-5)
- Is the response easy for a consumer to understand?

Output Format:
Output your evaluation as a single valid JSON object matching the following structure:
{
    "accuracy": {
        "score": 0,
        "explanation": "Brief explanation of why this score was given."
    },
    "completeness": {
        "score": 0,
        "explanation": "Brief explanation of why this score was given."
    },
    "clarity": {
        "score": 0,
        "explanation": "Brief explanation of why this score was given."
    }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
  maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


class MedicationQAAnnotator(LLMAsJuryAnnotator):
    """The MedicationQA autograder."""

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
    ):
        super().__init__(
            name="medication_qa",
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=annotator_models,
        )

@@ -0,0 +1,87 @@
from typing import Dict, Optional, Set

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient


PROMPT_TEMPLATE = """You are a mental health expert tasked with evaluating the quality
of a counselor response in a therapy conversation.
Your goal is to assess how well the generated response demonstrates therapeutic skills,
and how it compares to the gold response in terms of accuracy, completeness, and clarity.

The conversation history and question will be provided in these tags:
<conversation>
{QUESTION}
</conversation>

The response will be provided in these tags:
<response>
{RESPONSE}
</response>

The reference response will be provided in these tags:
<gold_response>
{GOLD_RESPONSE}
</gold_response>

Carefully analyze the <response> compared to the <gold_response> and the original <conversation>.
For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
and provide a short justification for your score.

Evaluation Criteria:
Accuracy (1-5)
- Does the response use appropriate therapeutic techniques and show understanding of the client's concerns?

Completeness (1-5)
- Does the response address the client's key concerns and include important context from the conversation?

Clarity (1-5)
- Is the response easy to understand and communicated in an empathetic, therapeutic way?

Output Format:
Output the evaluation as a single valid JSON object matching the following structure:
{
    "accuracy": {
        "score": 0,
        "explanation": "Explain why this score was given."
    },
    "completeness": {
        "score": 0,
        "explanation": "Explain why this score was given."
    },
    "clarity": {
        "score": 0,
        "explanation": "Explain why this score was given."
    }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
  maintain valid JSON formatting.
- Do not include any additional information in the output.
"""

ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
    "completeness": {"score", "explanation"},
    "clarity": {"score", "explanation"},
}


class MentalHealthAnnotator(LLMAsJuryAnnotator):
    """The MentalHealth autograder."""

    def __init__(
        self,
        auto_client: AutoClient,
        annotator_models: Dict[str, AnnotatorModelInfo],
        template_name: Optional[str] = None,
    ):
        super().__init__(
            name="mental_health",
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=annotator_models,
        )

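A minimal sketch of wiring one of these jury annotators together; the AnnotatorModelInfo fields (model_name, model_deployment) and the importing module path are assumptions, since neither appears in this section of the diff:

    from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
    # Hypothetical module path for the class defined in the hunk above:
    from helm.benchmark.annotation.mental_health_annotator import MentalHealthAnnotator

    annotator = MentalHealthAnnotator(
        auto_client=auto_client,  # an existing helm.clients.auto_client.AutoClient
        annotator_models={
            # Judge model id reused from the LiveQA hunk above:
            "gpt4o": AnnotatorModelInfo(
                model_name="openai/gpt-4o-2024-05-13",
                model_deployment="openai/gpt-4o-2024-05-13",
            ),
        },
    )
    annotation = annotator.annotate(request_state)  # request_state: a completed RequestState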