crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -148,3 +146,69 @@ def get_model_names_with_tokenizer(tokenizer_name: str) -> List[str]:
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.tokenizer_name == tokenizer_name
     ]
     return [deployment.model_name or deployment.name for deployment in deployments]
+
+
+def get_default_model_deployment_for_model(
+    model_name: str, warn_arg_deprecated: bool = False, ignore_deprecated: bool = False
+) -> Optional[str]:
+    """Returns a valid model deployment name corresponding to the given model arg.
+    This is used as a backwards compatibility layer for model names that are now moved to model deployments.
+    Example: "anthropic/claude-v1.3" => "anthropic/claude-v1.3"
+    Example: "meta/llama-7b" => "together/llama-7b"
+
+    The process to find a model deployment name is as follows:
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.
+
+    This function will also try to find a model deployment name that is not deprecated.
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
+    If ignore_deprecated is True, this function will return None if the model deployment is deprecated.
+
+    If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
+    as the model arg. This is to remind the user that the model name is deprecated and should be replaced with
+    the model deployment name (in their config).
+
+    Args:
+        model_arg: The model arg to convert to a model deployment name.
+        warn_arg_deprecated: Whether to print a warning if the model deployment name is not the same as the model arg.
+        ignore_deprecated: Whether to return None if the model deployment is deprecated.
+    """
+
+    # If there is at least one deployment for the model, use the last one that is available.
+    available_deployments: List[ModelDeployment] = [
+        deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
+    ]
+    if len(available_deployments) > 0:
+        available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
+        if warn_arg_deprecated:
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
+            hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
+
+        # Additionally, if there is a non-deprecated deployment, use it.
+        non_deprecated_deployments: List[ModelDeployment] = [
+            deployment for deployment in available_deployments if not deployment.deprecated
+        ]
+        if len(non_deprecated_deployments) > 0:
+            chosen_deployment = non_deprecated_deployments[-1]
+        # There are no non-deprecated deployments, so there are two options:
+        # 1. If we can return an empty string, return it. (no model deployment is available)
+        # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
+        elif ignore_deprecated:
+            return None
+        elif len(available_deployments) > 0:
+            chosen_deployment = available_deployments[-1]
+            if warn_arg_deprecated:
+                hwarn(f"All model deployments for model {model_name} are deprecated.")
+        else:
+            return None
+        if warn_arg_deprecated:
+            hlog(
+                f"Choosing {chosen_deployment.name} (the last one) as "
+                f"the default model deployment for model {model_name}"
+            )
+            hlog("If you want to use a different model deployment, please specify it explicitly.")
+        return chosen_deployment.name
+
+    # Some models are added but have no deployments yet.
+    # In this case, we return None.
+    return None
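
For orientation, a minimal usage sketch of the new fallback helper (illustrative only; the import path `helm.benchmark.model_deployment_registry` is an assumption, and the model name is taken from the docstring's own example):

    # Hypothetical caller: resolve a legacy model name to a registered deployment name.
    from helm.benchmark.model_deployment_registry import get_default_model_deployment_for_model

    deployment_name = get_default_model_deployment_for_model("meta/llama-7b", warn_arg_deprecated=True)
    if deployment_name is None:
        # No deployment is registered for this model yet.
        raise ValueError("No model deployment found for meta/llama-7b")
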
@@ -22,11 +22,18 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
+ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
 
 GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
+GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
+GOOGLE_GEMINI_PRO_VISION_V1_TAG: str = "GOOGLE_GEMINI_PRO_VISION_V1_TAG"
+GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
 
 # Models which emit garbage tokens when temperature=0.
 BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG"
@@ -46,12 +53,38 @@ NLG_PREFIX_TAG: str = "NLG_PREFIX_TAG"
 # Some models can follow instructions.
 INSTRUCTION_FOLLOWING_MODEL_TAG: str = "INSTRUCTION_FOLLOWING_MODEL_TAG"
 
+# For text-to-image models
+TEXT_TO_IMAGE_MODEL_TAG: str = "TEXT_TO_IMAGE_MODEL_TAG"
+
 # For Vision-langauge models (VLMs)
 VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
-
+# IDEFICS require a special prompt format (see `IDEFICSInstructRunExpander`)
+IDEFICS_INSTRUCT_MODEL_TAG: str = "IDEFICS_INSTRUCT_MODEL_TAG"
+IDEFICS_MODEL_TAG: str = "IDEFICS_MODEL_TAG"
+# Llava should use a special prompt format (see `LlavaRunExpander`)
+LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"
+# OpenFlamingo has a special prompt format (see `OpenFlamingoRunExpander`)
+OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
+# Some VLMs do not support multiple images in the prompt
+LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
+FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
+
+# For Audio-langauge models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
+# Deprecated models that are no longer available.
+# These are usually closed API models that have been permanently removed
+DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
+
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
 
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
+
+
 @dataclass(frozen=False)
 class ModelMetadata:
     name: str
@@ -138,7 +171,10 @@ def register_model_metadata(model_metadata: ModelMetadata) -> None:
 def get_model_metadata(model_name: str) -> ModelMetadata:
     """Return the `ModelMetadata` for the model name."""
     if model_name not in MODEL_NAME_TO_MODEL_METADATA:
-        raise ValueError(
+        raise ValueError(
+            f"No model metadata for model name: {model_name} - "
+            "did you remember to add this model to model_metadata.yaml?"
+        )
 
     return MODEL_NAME_TO_MODEL_METADATA[model_name]
 
@@ -153,6 +189,11 @@ def get_model_names_with_tag(tag: str) -> List[str]:
     return [model.name for model in ALL_MODELS_METADATA if tag in model.tags]
 
 
+def model_has_tag(model_name: str, tag: str) -> bool:
+    """Return True if the model has the given tag. False otherwise."""
+    return tag in get_model_metadata(model_name).tags
+
+
 def get_all_text_models() -> List[str]:
     """Return all model names of text models."""
     return get_model_names_with_tag(TEXT_MODEL_TAG)
@@ -168,6 +209,21 @@ def get_all_instruction_following_models() -> List[str]:
     return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
 
 
+def is_text_to_image_model(model_name: str) -> bool:
+    """Returns True if the model is a text-to-image model. False otherwise."""
+    return model_has_tag(model_name, TEXT_TO_IMAGE_MODEL_TAG)
+
+
+def is_vlm(model_name: str) -> bool:
+    """Returns True if the model is a vision-language model (VLM). False otherwise."""
+    return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
+
+
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is a audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(
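
A short sketch of how the new tag predicates might be used downstream (illustrative only; the import path and the model name are assumptions, and the name must already be registered in model_metadata.yaml):

    from helm.benchmark.model_metadata_registry import is_audiolm, is_text_to_image_model, is_vlm

    model_name = "example-org/example-vlm"  # hypothetical registered model
    if is_vlm(model_name):
        print("vision-language model: prompts may include images")
    elif is_audiolm(model_name):
        print("audio-language model: prompts may include audio")
    elif is_text_to_image_model(model_name):
        print("text-to-image model: completions are images")
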
@@ -0,0 +1,133 @@
+import signal
+import threading
+import traceback
+from typing import List
+import os
+import time
+import torch
+import torch.multiprocessing as multiprocessing
+from concurrent.futures import ProcessPoolExecutor as Pool
+from tqdm import tqdm
+
+from helm.benchmark.config_registry import (
+    register_configs_from_directory,
+    register_builtin_configs_from_helm_package,
+)
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.runner import Runner, RunSpec, RunnerError
+from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.benchmark.runner_config_registry import RUNNER_CONFIG
+
+_MAX_CONCURRENT_WORKERS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKERS"
+
+
+# From
+# https://stackoverflow.com/questions/71300294/how-to-terminate-pythons-processpoolexecutor-when-parent-process-dies
+def start_thread_to_terminate_when_parent_process_dies(ppid):
+    pid = os.getpid()
+
+    def f():
+        while True:
+            try:
+                os.kill(ppid, 0)
+            except OSError:
+                os.kill(pid, signal.SIGTERM)
+            time.sleep(1)
+
+    thread = threading.Thread(target=f, daemon=True)
+    thread.start()
+
+
+def initialize_worker(gpu_id: int):
+    hlog(f"Worker {gpu_id} initializing")
+
+    # Wait for 0.1 seconds to ensure all workers are initialized with different CUDA_VISIBLE_DEVICES
+    time.sleep(0.1)
+
+    # Pin GPU to worker process
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+
+    # Necessary for code_metrics in humaneval to work properly
+    multiprocessing.set_start_method("fork", force=True)
+
+
+class MultiGPURunner(Runner):
+    """Runner that runs the entire benchmark on multiple GPUs.
+
+    This is a thin wrapper around `Runner` that runs the entire benchmark on
+    multiple GPUs using `multiprocessing`.
+
+    Note that this runner will load multiple models into memory at the same
+    time if your running configuration specifies that, similar to the `Runner`
+    class. `SlurmRunner` on the other hand will load at most one model on a
+    GPU"""
+
+    def __init__(
+        self,
+        execution_spec: ExecutionSpec,
+        output_path: str,
+        suite: str,
+        skip_instances: bool,
+        cache_instances: bool,
+        cache_instances_only: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
+    ):
+        super().__init__(
+            execution_spec=execution_spec,
+            output_path=output_path,
+            suite=suite,
+            skip_instances=skip_instances,
+            cache_instances=cache_instances,
+            cache_instances_only=cache_instances_only,
+            skip_completed_runs=skip_completed_runs,
+            exit_on_error=exit_on_error,
+        )
+        # Configure max concurrent worker jobs from the environment variable.
+        env_max_concurrent_workers = os.getenv(_MAX_CONCURRENT_WORKERS_ENV_NAME)
+        self.max_concurrent_workers = (
+            int(env_max_concurrent_workers)
+            if env_max_concurrent_workers
+            else (
+                RUNNER_CONFIG.helm_max_concurrent_workers
+                if RUNNER_CONFIG.helm_max_concurrent_workers > 0
+                else torch.cuda.device_count()
+            )
+        )
+
+    def safe_run_one(self, run_spec: RunSpec):
+        register_builtin_configs_from_helm_package()
+        if self.executor.execution_spec.local_path is not None:
+            register_configs_from_directory(self.executor.execution_spec.local_path)
+
+        try:
+            with htrack_block(f"Running {run_spec.name}"):
+                self.run_one(run_spec)
+        except Exception as e:
+            hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+            return e
+
+    def run_all(self, run_specs: List[RunSpec]):
+        """Run the entire benchmark on multiple GPU"""
+
+        # Set the start method to forkserver to avoid issues with CUDA.
+        multiprocessing.set_start_method("forkserver")
+
+        with Pool(
+            max_workers=self.max_concurrent_workers,
+            initializer=start_thread_to_terminate_when_parent_process_dies,
+            initargs=(os.getpid(),),
+        ) as pool:
+            # Pin GPUs to each worker process
+            pool.map(initialize_worker, [i for i in range(self.max_concurrent_workers)])
+
+            # Run all queued tasks
+            error_msgs = list(tqdm(pool.map(self.safe_run_one, run_specs), total=len(run_specs), disable=None))
+
+            # Raise exception for failed runs, if any.
+            failed_run_names = [
+                run_spec.name for error_msg, run_spec in zip(error_msgs, run_specs) if error_msg is not None
+            ]
+            if failed_run_names:
+                failed_runs_str = ", ".join([f'"{run_name}"' for run_name in failed_run_names])
+                raise RunnerError(f"Failed runs: [{failed_runs_str}]")
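
A minimal sketch of the worker-count precedence that `MultiGPURunner.__init__` implements above; the standalone function name below is illustrative and not part of HELM:

    import os

    import torch

    def resolve_max_workers(config_value: int) -> int:
        # HELM_MAX_CONCURRENT_WORKERS wins, then a positive runner-config value,
        # then one worker per visible CUDA device.
        env_value = os.getenv("HELM_MAX_CONCURRENT_WORKERS")
        if env_value:
            return int(env_value)
        if config_value > 0:
            return config_value
        return torch.cuda.device_count()
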
@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -10,10 +13,10 @@ from typing import List, Dict, Optional, Any, Callable, Union, Mapping, Tuple, S
 import numpy as np
 from scipy.stats import pearsonr
 
-from helm.
+from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
-from helm.benchmark.
-from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
+from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
 try:
     import colorcet
@@ -38,6 +41,7 @@ metric_group_to_label = {
     "Efficiency": f"Inference time (s) {DOWN_ARROW}",
 }
 all_metric_groups = list(metric_group_to_label.keys())
+AGGREGATE_WIN_RATE_COLUMN = 1
 
 
 @dataclass
@@ -133,9 +137,6 @@ class Plotter:
         self.plot_format = plot_format
         self._tables_cache: Dict[str, Dict[str, Table]] = {}
 
-        schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
-        self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
-
     def get_group_tables(self, group_name: str) -> Dict[str, Table]:
         """Reads and parses group tables. Uses _tables_cache to avoid reprocessing the same table multiple times."""
         if group_name in self._tables_cache:
@@ -338,14 +339,14 @@
 
         def get_model_release_date(model_name: str) -> Optional[date]:
             """Maps a model name to the month of model release."""
-            release_date =
+            release_date = MODEL_NAME_TO_MODEL_METADATA[model_name].release_date
             if release_date is None:
                 return None
             return release_date.replace(day=1)
 
         def get_model_size(model_name: str) -> Optional[int]:
             """Maps a model name to the number of parameters, rounding to the nearest leading digit."""
-            size =
+            size = MODEL_NAME_TO_MODEL_METADATA[model_name].num_parameters
             if size is None:
                 return None
             grain = 10 ** (len(str(size)) - 1)
@@ -401,7 +402,9 @@
 
         for i, access_level in enumerate(access_levels):
             model_indices: List[int] = [
-                idx
+                idx
+                for idx, model in enumerate(table.adapters)
+                if MODEL_NAME_TO_MODEL_METADATA[model].access == access_level
             ]
             best_model_index = model_indices[table.mean_win_rates[model_indices].argmax()]
 
@@ -600,6 +603,17 @@
         self.create_constrast_set_plots()
 
 
+def create_plots(args):
+    register_builtin_configs_from_helm_package()
+    base_path = os.path.join(args.output_path, "runs", args.suite)
+    if not os.path.exists(os.path.join(base_path, "groups")):
+        hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")
+        return
+    save_path = os.path.join(base_path, "plots")
+    plotter = Plotter(base_path=base_path, save_path=save_path, plot_format=args.plot_format)
+    plotter.create_all_plots()
+
+
 def main():
     """
     This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
|
|
|
607
621
|
the top-level command `helm-create-plots`.
|
|
608
622
|
"""
|
|
609
623
|
parser = argparse.ArgumentParser()
|
|
610
|
-
parser.add_argument(
|
|
611
|
-
|
|
612
|
-
|
|
624
|
+
parser.add_argument(
|
|
625
|
+
"-o",
|
|
626
|
+
"--output-path",
|
|
627
|
+
type=str,
|
|
628
|
+
help="Path to benchmarking output",
|
|
629
|
+
default="benchmark_output",
|
|
630
|
+
)
|
|
631
|
+
parser.add_argument(
|
|
632
|
+
"--suite",
|
|
633
|
+
type=str,
|
|
634
|
+
help="Name of the suite that we are plotting",
|
|
635
|
+
required=True,
|
|
636
|
+
)
|
|
637
|
+
parser.add_argument(
|
|
638
|
+
"--plot-format",
|
|
639
|
+
help="Format for saving plots",
|
|
640
|
+
default="png",
|
|
641
|
+
choices=["png", "pdf"],
|
|
642
|
+
)
|
|
643
|
+
parser.add_argument(
|
|
644
|
+
"--log-config",
|
|
645
|
+
type=str,
|
|
646
|
+
default=None,
|
|
647
|
+
help="PATH to a YAML file to customize logging",
|
|
648
|
+
)
|
|
613
649
|
args = parser.parse_args()
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")
|
|
617
|
-
return
|
|
618
|
-
save_path = os.path.join(base_path, "plots")
|
|
619
|
-
plotter = Plotter(base_path=base_path, save_path=save_path, plot_format=args.plot_format)
|
|
620
|
-
plotter.create_all_plots()
|
|
650
|
+
setup_default_logging(args.log_config)
|
|
651
|
+
create_plots(args)
|
|
621
652
|
|
|
622
653
|
|
|
623
654
|
if __name__ == "__main__":
|
|
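
An illustrative way to drive the refactored plotting entry point programmatically, mirroring the new argparse flags (the module path and all values below are assumptions; the supported interface is the `helm-create-plots` command mentioned in the docstring):

    import argparse

    from helm.benchmark.presentation.create_plots import create_plots  # assumed module path

    args = argparse.Namespace(
        output_path="benchmark_output",
        suite="my-suite",
        plot_format="pdf",
        log_config=None,
    )
    create_plots(args)  # reads benchmark_output/runs/my-suite/groups and writes .../plots
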
@@ -1,9 +1,10 @@
 from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 import os
-
+import re
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
 
-from helm.benchmark.adaptation.
+from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
 )
@@ -12,11 +13,13 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
 from helm.benchmark.metrics.metric import PerInstanceStats
+from helm.common.multimodal_request_utils import gather_generated_image_locations
 from helm.benchmark.presentation.schema import Schema
-from helm.benchmark.
+from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.general import write
 from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.images_utils import encode_base64
 from helm.common.request import Request
 from helm.common.codec import from_json, to_json
 
@@ -43,6 +46,9 @@ class DisplayPrediction:
     truncated_predicted_text: Optional[str]
     """The truncated prediction text, if truncation is required by the Adapter method."""
 
+    base64_images: Optional[List[str]]
+    """Images in base64."""
+
     mapped_output: Optional[str]
     """The mapped output, if an output mapping exists and the prediction can be mapped"""
 
@@ -52,6 +58,11 @@
     stats: Dict[str, float]
     """Statistics computed from the predicted output"""
 
+    annotations: Optional[Dict[str, Any]]
+
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -73,7 +84,7 @@ class DisplayRequest:
     """The actual Request to display in the web frontend.
 
     There can be multiple requests per trial. The displayed request should be the
-    most relevant request e.g. the request for the chosen
+    most relevant request e.g. the request for the chosen choice for multiple choice questions."""
 
 
 def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
@@ -102,8 +113,7 @@ def _truncate_predicted_text(
         tokens = request_state.result.completions[0].tokens
         if tokens:
             first_token = tokens[0]
-
-            prefix = first_token.text
+            prefix = first_token.text
         if prefix:
             predicted_text = predicted_text
             prefix = prefix
|
|
|
126
136
|
if metric_group is None:
|
|
127
137
|
continue
|
|
128
138
|
for metric_name_matcher in metric_group.metrics:
|
|
129
|
-
if metric_name_matcher.perturbation_name:
|
|
139
|
+
if metric_name_matcher.perturbation_name and metric_name_matcher.perturbation_name != "__all__":
|
|
130
140
|
continue
|
|
131
141
|
result.add(metric_name_matcher.substitute(run_group.environment).name)
|
|
132
142
|
return result
|
|
@@ -253,12 +263,38 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if request_state.result is not None and request_state.result.completions
             else ""
         )
-        mapped_output =
-
+        mapped_output: Optional[str] = None
+        if request_state.output_mapping is not None:
+            output_to_map = predicted_text.strip()
+            if run_spec.adapter_spec.output_mapping_pattern:
+                match = re.search(run_spec.adapter_spec.output_mapping_pattern, output_to_map)
+                if not match:
+                    output_to_map = ""
+                elif match.groups():
+                    output_to_map = match.group(0)
+                else:
+                    output_to_map = match.string
+            mapped_output = request_state.output_mapping.get(output_to_map)
+        instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
+            request_state.instance
+        )
+
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
+        # Process images and include if they exist
+        images: List[str] = [
+            encode_base64(image_location)
+            for image_location in gather_generated_image_locations(request_state.result)
+            if os.path.exists(image_location)
+        ]
+
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
         )
-
-            (request_state.instance.id, request_state.instance.perturbation)
-        ] = request_state.instance
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
|
|
|
266
302
|
train_trial_index=request_state.train_trial_index,
|
|
267
303
|
predicted_text=predicted_text,
|
|
268
304
|
truncated_predicted_text=_truncate_predicted_text(predicted_text, request_state, run_spec.adapter_spec),
|
|
305
|
+
base64_images=images,
|
|
269
306
|
mapped_output=mapped_output,
|
|
270
307
|
reference_index=request_state.reference_index,
|
|
271
308
|
stats=trial_stats,
|
|
309
|
+
annotations=request_state.annotations,
|
|
310
|
+
thinking_text=thinking_text,
|
|
272
311
|
)
|
|
273
312
|
)
|
|
274
313
|
requests.append(
|
|
@@ -14,10 +14,10 @@ class RunEntry:
     description: str
 
     # Priority for this run spec (1 is highest priority, 5 is lowest priority)
-    priority: int
+    priority: Optional[int] = None
 
     # Additional groups to add to the run spec
-    groups: Optional[List[str]]
+    groups: Optional[List[str]] = None
 
 
 @dataclass(frozen=True)
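
With the new defaults above, a `RunEntry` can now be constructed from a description alone (the description string is hypothetical; the import path is omitted because the hunk does not name its module):

    entry = RunEntry(description="mmlu:subject=anatomy,model=text")  # RunEntry as defined above
    assert entry.priority is None
    assert entry.groups is None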