crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
helm/benchmark/adaptation/adapter_spec.py

@@ -1,6 +1,29 @@
 from dataclasses import dataclass, field
 from typing import List, Optional
 
+from helm.common.image_generation_parameters import ImageGenerationParameters
+from helm.common.reeval_parameters import REEvalParameters
+
+
+# Adaptation methods
+ADAPT_GENERATION: str = "generation"
+ADAPT_CHAT: str = "chat"
+ADAPT_LANGUAGE_MODELING: str = "language_modeling"
+ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
+ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
+ADAPT_RANKING_BINARY: str = "ranking_binary"
+ADAPT_EHR_INSTRUCTION: str = "ehr_instruction"
+ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+]
+
+# Multimodal adaptation methods
+ADAPT_GENERATION_MULTIMODAL: str = "generation_multimodal"
+ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL: str = "multiple_choice_joint_multimodal"
+
 
 @dataclass(frozen=True)
 class Substitution:
@@ -19,82 +42,110 @@ class AdapterSpec:
     Note that an `Instance` could produce many `Request`s (e.g., one for each `Reference`).
     """
 
-    # Method of adaptation
     method: str = ""
+    """The high-level strategy for converting instances into a prompt for the language model."""
 
-    # Prepend all prompts with this string.
-    # For example, it is recommended to prefix all prompts with [NLG] for UL2.
     global_prefix: str = ""
+    """The string that is prepended to the entire prompt."""
 
-    # Append all prompts with this string.
     global_suffix: str = ""
+    """The string that is appended to the entire prompt."""
 
-    # Prompt starts with instructions
     instructions: str = ""
+    """The description of the task that is included at the very beginning of the prompt."""
 
-    # What goes before the input
     input_prefix: str = "Input: "
+    """The string that is included before each input (e.g., 'Question:')."""
 
-    # What goes after the input
     input_suffix: str = "\n"
+    """The string that is included after each input (e.g., '\\n')."""
 
-    # What goes before the input (for multiple choice)
     reference_prefix: str = "A. "
+    """The string that is included before each reference (for multiple-choice questions)."""
+
+    # Set hash=False to make `AdapterSpec` hashable
+    reference_prefix_characters: Optional[List[str]] = field(default=None, hash=False)
+    """The characters that are used to identify choices for multiple-choice questions e.g. ["A", "B", "C", "D"].
+    If unset, defaults to the sequence of ascending characters starting from the first character of reference_prefix."""
 
-    # What goes before the input (for multiple choice)
     reference_suffix: str = "\n"
+    """The string that is included after each reference (for multiple-choice questions)."""
+
+    chain_of_thought_prefix: str = ""
+    """The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""
+
+    chain_of_thought_suffix: str = "\n"
+    """The string that is included after each chain of thought. (e.g., 'The correct answer is')"""
 
-    # What goes before the output
     output_prefix: str = "Output: "
+    """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
 
-    # What goes after the output
     output_suffix: str = "\n"
+    """The string that is included after the correct answer/predicted output (e.g., '\\n')."""
 
-    # What goes between instruction and in-context example blocks in the constructed prompt
     instance_prefix: str = "\n"
+    """The string that is included before each instance (e.g., '\\n\\n')."""
 
-    # List of regular expression substitutions that we perform
     substitutions: List[Substitution] = field(default_factory=list, hash=False)
+    """A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n')
+    to perform at the very end on the prompt."""
 
-    # Maximum number of (in-context) training instances to put into the prompt
     max_train_instances: int = 5
+    """Maximum number of training instances to include in the prompt (currently by randomly sampling)."""
 
-    # Maximum number of evaluation instances. For getting valid numbers, this
-    # should be the entire dataset; only reduce this for piloting.
     max_eval_instances: Optional[int] = None
+    """Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."""
 
-    # Generate this many outputs (which could be realized by `num_completions`
-    # or `top_k_per_token`).
     num_outputs: int = 5
+    """Maximum number of possible outputs to generate by sampling multiple outputs."""
 
-    # Number of trials, where in each trial we choose an independent, random
-    # set of training instances. Used to compute error bars.
     num_train_trials: int = 1
+    """Number of trials, where in each trial we choose an independent, random set of training instances.
+    Used to compute variance."""
+
+    num_trials: int = 1
+    """Number of trials, where we query the model with the same requests, but different random seeds."""
 
-    # If true, randomly sample N training examples; if false, select N consecutive training examples
     sample_train: bool = True
+    """If true, randomly sample N training examples; if false, select N consecutive training examples"""
 
     # Decoding parameters (inherited by `Request`)
 
-    # Model deployment to make the request to (need to fill in)
     model_deployment: str = ""
+    """Name of the language model deployment (<host_organization>/<model name>) to send requests to."""
 
-    # DEPRECATED: old model field, kept for backward compatibility
-    # TODO: Remove this once we do not wish to support backward compatibility anymore.
     model: str = ""
+    """Name of the language model (<creator_organization>/<model name>) to send requests to."""
 
-    # Temperature to use
     temperature: float = 1
+    """Temperature parameter used in generation."""
 
-    # Maximum number of tokens to generate
     max_tokens: int = 100
+    """Maximum number of tokens to generate."""
 
-    #
+    # Set hash=False to make `AdapterSpec` hashable
     stop_sequences: List[str] = field(default_factory=list, hash=False)
+    """List of stop sequences. Output generation will be stopped if any stop sequence is encountered."""
 
     # Random string (used concretely to bypass cache / see diverse results)
     random: Optional[str] = None
+    """Random seed (string), which guarantees reproducibility."""
 
-    # If true, for instances with multiple correct reference, the gold answer should be considered
-    # to be all of the correct references rather than any of the correct references.
     multi_label: bool = False
+    """If true, for instances with multiple correct reference, the gold answer should be considered to be all
+    of the correct references rather than any of the correct references."""
+
+    image_generation_parameters: Optional[ImageGenerationParameters] = None
+    """Parameters for image generation."""
+
+    reeval_parameters: Optional[REEvalParameters] = None
+    """Parameters for reeval evaluation."""
+
+    # Set hash=False to make `AdapterSpec` hashable
+    eval_splits: Optional[List[str]] = field(default=None, hash=False)
+    """The splits from which evaluation instances will be drawn."""
+
+    output_mapping_pattern: Optional[str] = None
+    """Pattern to apply to the output before applying the output mapping for the joint multiple choice adapter.
+    If the pattern has no group, the output mapping will be applied to the first match.
+    If the pattern has a group, the output mapping will be applied to the group of the first match."""
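To illustrate the expanded dataclass, here is a minimal sketch (not taken from the package) of constructing an AdapterSpec that exercises some of the fields added in 0.5.10; the field names come from the diff above, while the deployment string is a hypothetical placeholder.

from helm.benchmark.adaptation.adapter_spec import (
    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    AdapterSpec,
)

# Sketch only: exercises new 0.5.10 fields (chain-of-thought affixes, num_trials, eval_splits).
spec = AdapterSpec(
    method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
    instructions="Answer the following questions.\n",
    chain_of_thought_prefix="Let's think step by step: ",
    chain_of_thought_suffix="The correct answer is ",
    max_train_instances=5,
    num_trials=3,  # repeat the same requests with different random seeds
    eval_splits=["test"],  # draw evaluation instances from the test split only
    model_deployment="example-org/example-model",  # hypothetical deployment name
    max_tokens=100,
)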
helm/benchmark/adaptation/adapters/adapter.py

@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
 from typing import List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.adaptation.
+from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.benchmark.window_services.window_service import WindowService
@@ -22,7 +22,7 @@ class Adapter(ABC):
         )
 
     @abstractmethod
-    def adapt(self, instances: List[Instance], parallelism: int) ->
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
         """
         Takes a a list of `Instance`s and returns a `ScenarioState` with the
         list of corresponding `RequestState`s.
helm/benchmark/adaptation/adapters/adapter_factory.py

@@ -1,31 +1,34 @@
-from
-
-
-
-
-
-
-
-
-from .multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
-from .binary_ranking_adapter import BinaryRankingAdapter
-from .multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
-
-# Adaptation methods
-ADAPT_GENERATION: str = "generation"
-ADAPT_LANGUAGE_MODELING: str = "language_modeling"
-ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
-ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
-ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
-ADAPT_RANKING_BINARY: str = "ranking_binary"
-
-ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
-    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_EHR_INSTRUCTION,
+    ADAPT_GENERATION,
+    ADAPT_CHAT,
+    ADAPT_GENERATION_MULTIMODAL,
+    ADAPT_LANGUAGE_MODELING,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
-
-
-
-
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_RANKING_BINARY,
+    AdapterSpec,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
+from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
+from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
+from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
+from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
+from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
+    MultipleChoiceJointMultimodalAdapter,
+)
+from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
+from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
+    MultipleChoiceJointChainOfThoughtAdapter,
+)
+from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
+from helm.benchmark.adaptation.adapters.ehr_instruction_adapter import EHRInstructionAdapter
 
 
 class AdapterFactory:
@@ -37,12 +40,18 @@ class AdapterFactory:
         method: str = adapter_spec.method
         adapter: Adapter
 
-        if method ==
+        if method == ADAPT_EHR_INSTRUCTION:
+            adapter = EHRInstructionAdapter(adapter_spec, tokenizer_service)
+        elif method == ADAPT_GENERATION:
             adapter = GenerationAdapter(adapter_spec, tokenizer_service)
+        elif method == ADAPT_CHAT:
+            adapter = ChatAdapter(adapter_spec, tokenizer_service)
         elif method == ADAPT_LANGUAGE_MODELING:
             adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
         elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
             adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
+        elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
         elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
             adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
         elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
@@ -51,6 +60,8 @@ class AdapterFactory:
             adapter = BinaryRankingAdapter(adapter_spec, tokenizer_service)
         elif method == ADAPT_GENERATION_MULTIMODAL:
             adapter = GenerationMultimodalAdapter(adapter_spec, tokenizer_service)
+        elif method == ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL:
+            adapter = MultipleChoiceJointMultimodalAdapter(adapter_spec, tokenizer_service)
         else:
             raise ValueError(f"Invalid adaptation method: {method}")
 
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py

@@ -3,7 +3,7 @@ from typing import List, Optional
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance, Reference, TRAIN_SPLIT, EVAL_SPLITS, CORRECT_TAG
 from helm.common.request import Request
-from .in_context_learning_adapter import InContextLearningAdapter
+from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
 
 
 class BinaryRankingAdapter(InContextLearningAdapter):
helm/benchmark/adaptation/adapters/chat_adapter.py

@@ -0,0 +1,49 @@
+from typing import List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.scenarios.scenario import Instance
+from helm.common.request import Request
+from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
+
+
+class ChatAdapter(InContextLearningAdapter):
+    """
+    Each `Instance` in a `Scenario` has a history of the format:
+
+    [
+        {"role": "user", "content": <user-content>},
+        {"role": "assistant", "content": <assistant-content>},
+        {"role": "user", "content": <user-content>},
+        ...
+    ]
+
+    """
+
+    def generate_requests(
+        self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
+    ) -> List[RequestState]:
+        if eval_instance.input.messages is None:
+            raise ValueError("ChatAdapter requires input.messages of instances to be non-empty")
+        request = Request(
+            model=self.adapter_spec.model,
+            model_deployment=self.adapter_spec.model_deployment,
+            messages=eval_instance.input.messages,
+            num_completions=self.adapter_spec.num_outputs,
+            temperature=self.adapter_spec.temperature,
+            max_tokens=self.adapter_spec.max_tokens,
+            stop_sequences=self.adapter_spec.stop_sequences,
+            random=self.adapter_spec.random,
+            image_generation_parameters=self.adapter_spec.image_generation_parameters,
+        )
+        request_state = RequestState(
+            instance=eval_instance,
+            reference_index=None,
+            request_mode=None,
+            train_trial_index=train_trial_index,
+            output_mapping=None,
+            request=request,
+            result=None,
+            num_train_instances=0,
+            prompt_truncated=False,
+        )
+        return [request_state]
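As a concrete illustration (not taken from the package) of the history format documented in the ChatAdapter docstring above, an instance's input.messages would carry a list shaped like this:

# Illustrative message history in the shape ChatAdapter expects on Instance.input.messages.
messages = [
    {"role": "user", "content": "What are the side effects of ibuprofen?"},
    {"role": "assistant", "content": "Common side effects include stomach upset and heartburn."},
    {"role": "user", "content": "Is it safe to take together with aspirin?"},
]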
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py

@@ -0,0 +1,108 @@
+from typing import List, Optional
+
+from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
+from helm.benchmark.adaptation.prompt import Prompt
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, Instance
+from helm.benchmark.window_services.window_service import EncodeResult
+from helm.common.tokenization_request import TokenizationToken
+
+
+# in the prompt templates for EHR instructions, this is the placeholder for the EHR part
+# which we use to compute accurate tokenized sequence lengths
+PROMPT_TEMPLATE_EHR_PLACEHOLDER = "{ehr}"
+
+
+class EHRInstructionAdapter(GenerationAdapter):
+    """
+    Each instance consists of the following:
+
+    EHRInstructionInput:
+        question: the question to answer or instruction to follow
+        ehr: the XML-tagged EHR to use as context to answer the question
+        prompt_template: a string template for how to combine the question + ehr
+
+    Reference output:
+        text: the 'golden' clinician response to the question
+
+    This Adapter combines the above into RequestStates with logic to truncate the EHR specifically
+    to fit in the context window with enough room for the instruction/question and the specified
+    amount of generated tokens.
+    """
+
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
+        """
+        Main adaptation method which takes all instances and turns them into `RequestState` objects.
+        """
+        # sanity check, since for now we assume that there are no training instances at all
+        if any(instance.split == TRAIN_SPLIT for instance in instances):
+            raise RuntimeError(f"Got train instances for {self.__class__.__name__} - expected only eval instances.")
+
+        # use superclass implementation here
+        return super().adapt(instances, parallelism)
+
+    def construct_prompt(
+        self,
+        train_instances: List[Instance],  # unused
+        eval_instance: Instance,
+        include_output: bool,  # unused
+        reference_index: Optional[int],  # unused
+    ) -> Prompt:
+        """
+        Uses the instance to construct a prompt for a given eval instance.
+
+        Parameters
+        ----------
+        eval_instance: Instance
+            the instance we wish to use to construct the prompt
+        """
+        # start by simply getting the inputs
+        question = eval_instance.input.text
+        assert eval_instance.extra_data is not None
+        ehr_text: str = eval_instance.extra_data["ehr"]
+        prompt_template: str = eval_instance.extra_data["prompt_template"]
+        full_prompt_text = prompt_template.format(question=question, ehr=ehr_text)
+
+        # insert the question and see how many tokens we have so far
+        prompt_with_instr_no_ehr_placeholder = prompt_template.format(question=question, ehr="")
+        num_tokens_no_ehr = self.window_service.get_num_tokens(prompt_with_instr_no_ehr_placeholder)
+
+        # number of tokens we can allow the EHR part to be
+        target_ehr_num_tokens = (
+            self.window_service.max_request_length - self.adapter_spec.max_tokens - num_tokens_no_ehr
+        )
+
+        # round-trip tokenization to get the correct token length we need
+        # NOTE: we truncate from the left side so that the most recent pieces of the EHR are included in the context
+        # as opposed to the canonical way of truncating from the right. This is done to match the MedAlign method.
+        full_ehr_tokens: EncodeResult = self.window_service.encode(ehr_text, max_length=None, truncation=False)
+        truncated_ehr_tokens: List[TokenizationToken] = full_ehr_tokens.tokens[-target_ehr_num_tokens:]
+        ehr_truncated: str
+        ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
+
+        # create the truncated prompt
+        truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
+        num_truncations = 1
+        while (
+            num_extra_tokens := self.adapter_spec.max_tokens
+            + self.window_service.get_num_tokens(truncated_prompt_text)
+            - self.window_service.max_request_length
+        ) > 0:
+            truncated_ehr_tokens = truncated_ehr_tokens[num_extra_tokens:]
+            ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
+            truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
+            num_truncations += 1
+
+        # naively construct the full non-truncated prompt
+        prompt = Prompt(
+            global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
+            instance_prefix=self.adapter_spec.instance_prefix,
+            substitutions=self.adapter_spec.substitutions,
+            instructions_block=self.adapter_spec.instructions,
+            train_instance_blocks=[],
+            eval_instance_block=full_prompt_text,
+            truncated_text=truncated_prompt_text,
+        )
+
+        return prompt
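The key step above is truncating the EHR from the left so that the most recent entries survive. A dependency-free toy sketch of that token budgeting follows; the helper name and numbers are made up for illustration and are not part of the package.

# Toy sketch of the EHR token budgeting used above: keep the rightmost (most recent)
# tokens so that prompt + EHR + generated tokens fit within the request window.
def truncate_ehr_tokens(ehr_tokens: list, max_request_length: int, prompt_tokens_without_ehr: int, max_gen_tokens: int) -> list:
    budget = max_request_length - prompt_tokens_without_ehr - max_gen_tokens
    return ehr_tokens[-budget:] if budget > 0 else []

print(truncate_ehr_tokens(list(range(10)), max_request_length=12, prompt_tokens_without_ehr=3, max_gen_tokens=2))
# -> [3, 4, 5, 6, 7, 8, 9]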
helm/benchmark/adaptation/adapters/generation_adapter.py

@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.request import Request
-from .in_context_learning_adapter import InContextLearningAdapter
+from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
 
 
 class GenerationAdapter(InContextLearningAdapter):
@@ -46,6 +46,7 @@ class GenerationAdapter(InContextLearningAdapter):
             max_tokens=self.adapter_spec.max_tokens,
             stop_sequences=self.adapter_spec.stop_sequences,
             random=self.adapter_spec.random,
+            image_generation_parameters=self.adapter_spec.image_generation_parameters,
         )
         request_state = RequestState(
             instance=eval_instance,
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py

@@ -7,11 +7,11 @@ from typing import List, Dict, Optional
 
 from helm.benchmark.adaptation.prompt import Prompt
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS, Reference
 from helm.common.general import parallel_map
-from helm.common.
-from .
+from helm.common.request import Request
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn
+from helm.benchmark.adaptation.adapters.adapter import Adapter
 
 
 class InContextLearningAdapter(Adapter, ABC):
@@ -30,7 +30,7 @@ class InContextLearningAdapter(Adapter, ABC):
         pass
 
     @htrack(None)
-    def adapt(self, instances: List[Instance], parallelism: int) ->
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
         """
         Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         The reason we don't do this per eval instance is that we create a common set of
@@ -39,8 +39,8 @@ class InContextLearningAdapter(Adapter, ABC):
         # Pick out training instances
         all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
         if len(all_train_instances) < self.adapter_spec.max_train_instances:
-
-                f"
+            hwarn(
+                f"only {len(all_train_instances)} training instances, "
                 f"wanted {self.adapter_spec.max_train_instances}"
             )
 
@@ -64,7 +64,7 @@ class InContextLearningAdapter(Adapter, ABC):
         )
 
         hlog(f"{len(all_request_states)} requests")
-        return
+        return all_request_states
 
     def _adapt_trial_index(
         self,
@@ -101,7 +101,23 @@ class InContextLearningAdapter(Adapter, ABC):
                 hlog(line)
 
         # Flatten and return
-
+        all_request_states: List[RequestState] = [request_state for result in results for request_state in result]
+        return self._add_trials(all_request_states)
+
+    def _add_trials(self, request_states: List[RequestState]) -> List[RequestState]:
+        """Expand the request states by adding trials."""
+        if self.adapter_spec.num_trials <= 1:
+            return request_states
+
+        all_request_states: List[RequestState] = request_states.copy()
+        for i in range(1, self.adapter_spec.num_trials):
+            seed: str = str(i)
+            for request_state in request_states:
+                request: Request = replace(request_state.request, random=seed)
+                all_request_states.append(replace(request_state, request=request))
+
+        assert len(all_request_states) == len(request_states) * self.adapter_spec.num_trials
+        return all_request_states
 
     def sample_examples(
         self, all_train_instances: List[Instance], seed: int, sample_train: bool = True
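The new _add_trials helper above duplicates every request once per extra trial, differing only in the random seed. A self-contained sketch of the same expansion with a stand-in request type (FakeRequest is hypothetical, not part of the package):

# Stand-alone sketch of the trial expansion performed by _add_trials above.
from dataclasses import dataclass, replace
from typing import List, Optional

@dataclass(frozen=True)
class FakeRequest:  # stand-in for helm.common.request.Request
    prompt: str
    random: Optional[str] = None

def add_trials(requests: List[FakeRequest], num_trials: int) -> List[FakeRequest]:
    expanded = list(requests)
    for trial in range(1, num_trials):
        # each extra trial reuses the same requests with a different random seed
        expanded.extend(replace(r, random=str(trial)) for r in requests)
    return expanded

print(len(add_trials([FakeRequest("a"), FakeRequest("b")], num_trials=3)))  # 6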
helm/benchmark/adaptation/adapters/language_modeling_adapter.py

@@ -1,14 +1,13 @@
 from typing import List, Tuple, Optional
 
 from helm.benchmark.adaptation.request_state import RequestState
-from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.scenarios.scenario import Instance, EVAL_SPLITS
 from helm.benchmark.window_services.window_service import EncodeResult
 from helm.common.general import flatten_list, parallel_map
 from helm.common.hierarchical_logger import hlog, htrack
 from helm.common.request import Request
 from helm.common.tokenization_request import TokenizationToken
-from .adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter import Adapter
 
 
 class LanguageModelingAdapter(Adapter):
@@ -26,7 +25,7 @@ class LanguageModelingAdapter(Adapter):
     """
 
    @htrack(None)
-    def adapt(self, instances: List[Instance], parallelism: int) ->
+    def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
        """
         Takes a list of `Instance`s and builds a list of corresponding `RequestState`s.
         Only requires eval instances.
@@ -46,7 +45,7 @@ class LanguageModelingAdapter(Adapter):
         )
         hlog(f"{len(all_request_states)} requests")
 
-        return
+        return all_request_states
 
     def _generate_requests(self, eval_instance: Instance) -> List[RequestState]:
         """
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py

@@ -3,8 +3,10 @@ from typing import List
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.request import Request
-from .in_context_learning_multimodal_adapter import
-
+from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
+    InContextLearningMultimodalAdapter,
+)
+from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
 
 
 class GenerationMultimodalAdapter(InContextLearningMultimodalAdapter):
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py

@@ -8,7 +8,7 @@ from helm.common.hierarchical_logger import hlog
 from helm.common.media_object import MediaObject, MultimediaObject
 from helm.common.request import Request
 from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
-from .multimodal_prompt import MultimodalPrompt
+from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
 
 
 class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
@@ -79,6 +79,7 @@ class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
         # Prompt
         prompt = MultimodalPrompt(
             global_prefix=self.adapter_spec.global_prefix,
+            global_suffix=self.adapter_spec.global_suffix,
             instructions=self.adapter_spec.instructions,
             train_instance_blocks=train_instance_blocks,
             eval_instance_block=eval_instance_block,
helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py

@@ -11,6 +11,9 @@ class MultimodalPrompt:
     # Global prefix, carried over from `AdapterSpec`
     global_prefix: str
 
+    # Global suffix, carried over from `AdapterSpec`
+    global_suffix: str
+
     # Instance prefix, carried over from `AdapterSpec`. What goes between the instruction and instances.
     instance_prefix: str
 
@@ -47,6 +50,10 @@ class MultimodalPrompt:
         if self.global_prefix:
             result = result.add_textual_prefix(self.global_prefix)
 
+        # Add the global prefix if one exists
+        if self.global_suffix:
+            result = result.add_textual_suffix(self.global_suffix)
+
         return result
 
     @property