crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff summarizes the content changes between two publicly released versions of the package, as published to one of the supported registries. The information is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic. See the release's advisory page for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
|
@@ -2,16 +2,19 @@ from dataclasses import dataclass, field
|
|
|
2
2
|
from typing import List, Optional
|
|
3
3
|
|
|
4
4
|
from helm.common.image_generation_parameters import ImageGenerationParameters
|
|
5
|
+
from helm.common.reeval_parameters import REEvalParameters
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
# Adaptation methods
|
|
8
9
|
ADAPT_GENERATION: str = "generation"
|
|
10
|
+
ADAPT_CHAT: str = "chat"
|
|
9
11
|
ADAPT_LANGUAGE_MODELING: str = "language_modeling"
|
|
10
12
|
ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
|
|
13
|
+
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
|
|
11
14
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
|
|
12
15
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
|
|
13
16
|
ADAPT_RANKING_BINARY: str = "ranking_binary"
|
|
14
|
-
|
|
17
|
+
ADAPT_EHR_INSTRUCTION: str = "ehr_instruction"
|
|
15
18
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
|
|
16
19
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
17
20
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
@@ -63,6 +66,12 @@ class AdapterSpec:
|
|
|
63
66
|
reference_suffix: str = "\n"
|
|
64
67
|
"""The string that is included after each reference (for multiple-choice questions)."""
|
|
65
68
|
|
|
69
|
+
chain_of_thought_prefix: str = ""
|
|
70
|
+
"""The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""
|
|
71
|
+
|
|
72
|
+
chain_of_thought_suffix: str = "\n"
|
|
73
|
+
"""The string that is included after each chain of thought. (e.g., 'The correct answer is')"""
|
|
74
|
+
|
|
66
75
|
output_prefix: str = "Output: "
|
|
67
76
|
"""The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
|
|
68
77
|
|
|
@@ -124,6 +133,9 @@ class AdapterSpec:
|
|
|
124
133
|
image_generation_parameters: Optional[ImageGenerationParameters] = None
|
|
125
134
|
"""Parameters for image generation."""
|
|
126
135
|
|
|
136
|
+
reeval_parameters: Optional[REEvalParameters] = None
|
|
137
|
+
"""Parameters for reeval evaluation."""
|
|
138
|
+
|
|
127
139
|
# Set hash=False to make `AdapterSpec` hashable
|
|
128
140
|
eval_splits: Optional[List[str]] = field(default=None, hash=False)
|
|
129
141
|
"""The splits from which evaluation instances will be drawn."""
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
from helm.benchmark.adaptation.adapter_spec import (
|
|
2
|
+
ADAPT_EHR_INSTRUCTION,
|
|
2
3
|
ADAPT_GENERATION,
|
|
4
|
+
ADAPT_CHAT,
|
|
3
5
|
ADAPT_GENERATION_MULTIMODAL,
|
|
4
6
|
ADAPT_LANGUAGE_MODELING,
|
|
5
7
|
ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
8
|
+
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
|
|
6
9
|
ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
|
|
7
10
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
8
11
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
@@ -12,6 +15,7 @@ from helm.benchmark.adaptation.adapter_spec import (
|
|
|
12
15
|
from helm.benchmark.adaptation.adapters.adapter import Adapter
|
|
13
16
|
from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
|
|
14
17
|
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
|
|
18
|
+
from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
|
|
15
19
|
from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
|
|
16
20
|
from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
|
|
17
21
|
from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
|
|
@@ -19,8 +23,12 @@ from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimo
|
|
|
19
23
|
)
|
|
20
24
|
from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
|
|
21
25
|
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
|
|
26
|
+
from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
|
|
27
|
+
MultipleChoiceJointChainOfThoughtAdapter,
|
|
28
|
+
)
|
|
22
29
|
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
|
|
23
30
|
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
31
|
+
from helm.benchmark.adaptation.adapters.ehr_instruction_adapter import EHRInstructionAdapter
|
|
24
32
|
|
|
25
33
|
|
|
26
34
|
class AdapterFactory:
|
|
@@ -32,12 +40,18 @@ class AdapterFactory:
|
|
|
32
40
|
method: str = adapter_spec.method
|
|
33
41
|
adapter: Adapter
|
|
34
42
|
|
|
35
|
-
if method ==
|
|
43
|
+
if method == ADAPT_EHR_INSTRUCTION:
|
|
44
|
+
adapter = EHRInstructionAdapter(adapter_spec, tokenizer_service)
|
|
45
|
+
elif method == ADAPT_GENERATION:
|
|
36
46
|
adapter = GenerationAdapter(adapter_spec, tokenizer_service)
|
|
47
|
+
elif method == ADAPT_CHAT:
|
|
48
|
+
adapter = ChatAdapter(adapter_spec, tokenizer_service)
|
|
37
49
|
elif method == ADAPT_LANGUAGE_MODELING:
|
|
38
50
|
adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
|
|
39
51
|
elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
|
|
40
52
|
adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
|
|
53
|
+
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
|
|
54
|
+
adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
|
|
41
55
|
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
|
|
42
56
|
adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
|
|
43
57
|
elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
|
|
@@ -3,7 +3,7 @@ from typing import List, Optional
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.scenarios.scenario import Instance, Reference, TRAIN_SPLIT, EVAL_SPLITS, CORRECT_TAG
|
|
5
5
|
from helm.common.request import Request
|
|
6
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
6
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class BinaryRankingAdapter(InContextLearningAdapter):
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
|
+
from helm.benchmark.scenarios.scenario import Instance
|
|
5
|
+
from helm.common.request import Request
|
|
6
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ChatAdapter(InContextLearningAdapter):
|
|
10
|
+
"""
|
|
11
|
+
Each `Instance` in a `Scenario` has a history of the format:
|
|
12
|
+
|
|
13
|
+
[
|
|
14
|
+
{"role": "user", "content": <user-content>},
|
|
15
|
+
{"role": "assistant", "content": <assistant-content>},
|
|
16
|
+
{"role": "user", "content": <user-content>},
|
|
17
|
+
...
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def generate_requests(
|
|
23
|
+
self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
|
|
24
|
+
) -> List[RequestState]:
|
|
25
|
+
if eval_instance.input.messages is None:
|
|
26
|
+
raise ValueError("ChatAdapter requires input.messages of instances to be non-empty")
|
|
27
|
+
request = Request(
|
|
28
|
+
model=self.adapter_spec.model,
|
|
29
|
+
model_deployment=self.adapter_spec.model_deployment,
|
|
30
|
+
messages=eval_instance.input.messages,
|
|
31
|
+
num_completions=self.adapter_spec.num_outputs,
|
|
32
|
+
temperature=self.adapter_spec.temperature,
|
|
33
|
+
max_tokens=self.adapter_spec.max_tokens,
|
|
34
|
+
stop_sequences=self.adapter_spec.stop_sequences,
|
|
35
|
+
random=self.adapter_spec.random,
|
|
36
|
+
image_generation_parameters=self.adapter_spec.image_generation_parameters,
|
|
37
|
+
)
|
|
38
|
+
request_state = RequestState(
|
|
39
|
+
instance=eval_instance,
|
|
40
|
+
reference_index=None,
|
|
41
|
+
request_mode=None,
|
|
42
|
+
train_trial_index=train_trial_index,
|
|
43
|
+
output_mapping=None,
|
|
44
|
+
request=request,
|
|
45
|
+
result=None,
|
|
46
|
+
num_train_instances=0,
|
|
47
|
+
prompt_truncated=False,
|
|
48
|
+
)
|
|
49
|
+
return [request_state]
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
|
|
4
|
+
from helm.benchmark.adaptation.prompt import Prompt
|
|
5
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
|
+
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, Instance
|
|
7
|
+
from helm.benchmark.window_services.window_service import EncodeResult
|
|
8
|
+
from helm.common.tokenization_request import TokenizationToken
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# in the prompt templates for EHR instructions, this is the placeholder for the EHR part
|
|
12
|
+
# which we use to compute accurate tokenized sequence lengths
|
|
13
|
+
PROMPT_TEMPLATE_EHR_PLACEHOLDER = "{ehr}"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EHRInstructionAdapter(GenerationAdapter):
|
|
17
|
+
"""
|
|
18
|
+
Each instance consists of the following:
|
|
19
|
+
|
|
20
|
+
EHRInstructionInput:
|
|
21
|
+
question: the question to answer or instruction to follow
|
|
22
|
+
ehr: the XML-tagged EHR to use as context to answer the question
|
|
23
|
+
prompt_template: a string template for how to combine the question + ehr
|
|
24
|
+
|
|
25
|
+
Reference output:
|
|
26
|
+
text: the 'golden' clinician response to the question
|
|
27
|
+
|
|
28
|
+
This Adapter combines the above into RequestStates with logic to truncate the EHR specifically
|
|
29
|
+
to fit in the context window with enough room for the instruction/question and the specified
|
|
30
|
+
amount of generated tokens.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
|
|
34
|
+
"""
|
|
35
|
+
Main adaptation method which takes all instances and turns them into `RequestState` objects.
|
|
36
|
+
"""
|
|
37
|
+
# sanity check, since for now we assume that there are no training instances at all
|
|
38
|
+
if any(instance.split == TRAIN_SPLIT for instance in instances):
|
|
39
|
+
raise RuntimeError(f"Got train instances for {self.__class__.__name__} - expected only eval instances.")
|
|
40
|
+
|
|
41
|
+
# use superclass implementation here
|
|
42
|
+
return super().adapt(instances, parallelism)
|
|
43
|
+
|
|
44
|
+
def construct_prompt(
|
|
45
|
+
self,
|
|
46
|
+
train_instances: List[Instance], # unused
|
|
47
|
+
eval_instance: Instance,
|
|
48
|
+
include_output: bool, # unused
|
|
49
|
+
reference_index: Optional[int], # unused
|
|
50
|
+
) -> Prompt:
|
|
51
|
+
"""
|
|
52
|
+
Uses the instance to construct a prompt for a given eval instance.
|
|
53
|
+
|
|
54
|
+
Parameters
|
|
55
|
+
----------
|
|
56
|
+
eval_instance: Instance
|
|
57
|
+
the instance we wish to use to construct the prompt
|
|
58
|
+
"""
|
|
59
|
+
# start by simply getting the inputs
|
|
60
|
+
question = eval_instance.input.text
|
|
61
|
+
assert eval_instance.extra_data is not None
|
|
62
|
+
ehr_text: str = eval_instance.extra_data["ehr"]
|
|
63
|
+
prompt_template: str = eval_instance.extra_data["prompt_template"]
|
|
64
|
+
full_prompt_text = prompt_template.format(question=question, ehr=ehr_text)
|
|
65
|
+
|
|
66
|
+
# insert the question and see how many tokens we have so far
|
|
67
|
+
prompt_with_instr_no_ehr_placeholder = prompt_template.format(question=question, ehr="")
|
|
68
|
+
num_tokens_no_ehr = self.window_service.get_num_tokens(prompt_with_instr_no_ehr_placeholder)
|
|
69
|
+
|
|
70
|
+
# number of tokens we can allow the EHR part to be
|
|
71
|
+
target_ehr_num_tokens = (
|
|
72
|
+
self.window_service.max_request_length - self.adapter_spec.max_tokens - num_tokens_no_ehr
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# round-trip tokenization to get the correct token length we need
|
|
76
|
+
# NOTE: we truncate from the left side so that the most recent pieces of the EHR are included in the context
|
|
77
|
+
# as opposed to the canonical way of truncating from the right. This is done to match the MedAlign method.
|
|
78
|
+
full_ehr_tokens: EncodeResult = self.window_service.encode(ehr_text, max_length=None, truncation=False)
|
|
79
|
+
truncated_ehr_tokens: List[TokenizationToken] = full_ehr_tokens.tokens[-target_ehr_num_tokens:]
|
|
80
|
+
ehr_truncated: str
|
|
81
|
+
ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
|
|
82
|
+
|
|
83
|
+
# create the truncated prompt
|
|
84
|
+
truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
|
|
85
|
+
num_truncations = 1
|
|
86
|
+
while (
|
|
87
|
+
num_extra_tokens := self.adapter_spec.max_tokens
|
|
88
|
+
+ self.window_service.get_num_tokens(truncated_prompt_text)
|
|
89
|
+
- self.window_service.max_request_length
|
|
90
|
+
) > 0:
|
|
91
|
+
truncated_ehr_tokens = truncated_ehr_tokens[num_extra_tokens:]
|
|
92
|
+
ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
|
|
93
|
+
truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
|
|
94
|
+
num_truncations += 1
|
|
95
|
+
|
|
96
|
+
# naively construct the full non-truncated prompt
|
|
97
|
+
prompt = Prompt(
|
|
98
|
+
global_prefix=self.adapter_spec.global_prefix,
|
|
99
|
+
global_suffix=self.adapter_spec.global_suffix,
|
|
100
|
+
instance_prefix=self.adapter_spec.instance_prefix,
|
|
101
|
+
substitutions=self.adapter_spec.substitutions,
|
|
102
|
+
instructions_block=self.adapter_spec.instructions,
|
|
103
|
+
train_instance_blocks=[],
|
|
104
|
+
eval_instance_block=full_prompt_text,
|
|
105
|
+
truncated_text=truncated_prompt_text,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
return prompt
|
|
@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.scenarios.scenario import Instance
|
|
6
6
|
from helm.common.request import Request
|
|
7
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
7
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class GenerationAdapter(InContextLearningAdapter):
|
|
@@ -11,7 +11,7 @@ from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
|
|
|
11
11
|
from helm.common.general import parallel_map
|
|
12
12
|
from helm.common.request import Request
|
|
13
13
|
from helm.common.hierarchical_logger import hlog, htrack, htrack_block
|
|
14
|
-
from .adapter import Adapter
|
|
14
|
+
from helm.benchmark.adaptation.adapters.adapter import Adapter
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class InContextLearningAdapter(Adapter, ABC):
|
|
@@ -7,7 +7,7 @@ from helm.common.general import flatten_list, parallel_map
|
|
|
7
7
|
from helm.common.hierarchical_logger import hlog, htrack
|
|
8
8
|
from helm.common.request import Request
|
|
9
9
|
from helm.common.tokenization_request import TokenizationToken
|
|
10
|
-
from .adapter import Adapter
|
|
10
|
+
from helm.benchmark.adaptation.adapters.adapter import Adapter
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class LanguageModelingAdapter(Adapter):
|
|
@@ -3,8 +3,10 @@ from typing import List
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.scenarios.scenario import Instance
|
|
5
5
|
from helm.common.request import Request
|
|
6
|
-
from .in_context_learning_multimodal_adapter import
|
|
7
|
-
|
|
6
|
+
from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
|
|
7
|
+
InContextLearningMultimodalAdapter,
|
|
8
|
+
)
|
|
9
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class GenerationMultimodalAdapter(InContextLearningMultimodalAdapter):
|
|
@@ -8,7 +8,7 @@ from helm.common.hierarchical_logger import hlog
|
|
|
8
8
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
9
9
|
from helm.common.request import Request
|
|
10
10
|
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
11
|
-
from .multimodal_prompt import MultimodalPrompt
|
|
11
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
|
|
@@ -8,7 +8,7 @@ from helm.common.request import Request
|
|
|
8
8
|
from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
|
|
9
9
|
InContextLearningMultimodalAdapter,
|
|
10
10
|
)
|
|
11
|
-
from .multimodal_prompt import MultimodalPrompt
|
|
11
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, ABC):
|
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py
CHANGED
|
@@ -8,8 +8,10 @@ from helm.benchmark.scenarios.scenario import Instance, Reference, Input, Output
|
|
|
8
8
|
from helm.benchmark.window_services.test_utils import get_tokenizer_service
|
|
9
9
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
10
10
|
from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL, AdapterFactory
|
|
11
|
-
from .in_context_learning_multimodal_adapter import
|
|
12
|
-
|
|
11
|
+
from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
|
|
12
|
+
InContextLearningMultimodalAdapter,
|
|
13
|
+
)
|
|
14
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
13
15
|
|
|
14
16
|
|
|
15
17
|
class TestInContextLearningMultimodalAdapter(unittest.TestCase):
|
|
@@ -2,7 +2,7 @@ from typing import List
|
|
|
2
2
|
import unittest
|
|
3
3
|
|
|
4
4
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
5
|
-
from .multimodal_prompt import MultimodalPrompt
|
|
5
|
+
from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class TestMultimodalContent(unittest.TestCase):
|
|
@@ -3,7 +3,7 @@ from typing import List
|
|
|
3
3
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.scenarios.scenario import Instance, Input
|
|
6
|
-
from .multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
|
|
6
|
+
from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class MultipleChoiceCalibratedAdapter(MultipleChoiceSeparateAdapter):
|
|
@@ -3,7 +3,7 @@ from typing import List, Dict, Optional
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.scenarios.scenario import Instance
|
|
5
5
|
from helm.common.request import Request
|
|
6
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
6
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class MultipleChoiceJointAdapter(InContextLearningAdapter):
|
|
@@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
|
|
|
40
40
|
|
|
41
41
|
@staticmethod
|
|
42
42
|
def get_prefix_char(prefix: str) -> str:
|
|
43
|
-
return prefix.
|
|
43
|
+
return [char for char in prefix if char.isalnum()][0]
|
|
44
44
|
|
|
45
45
|
@staticmethod
|
|
46
46
|
def get_reference_prefix(prefix: str, i: int) -> str:
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.scenarios.scenario import Instance
|
|
4
|
+
from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
|
|
8
|
+
"""
|
|
9
|
+
Each `Instance` in a `Scenario` looks like this:
|
|
10
|
+
|
|
11
|
+
<input> -> <reference1>
|
|
12
|
+
<reference2>
|
|
13
|
+
<reference3> [correct]
|
|
14
|
+
<reference4>
|
|
15
|
+
|
|
16
|
+
<instance_chain_of_thought>
|
|
17
|
+
|
|
18
|
+
We can define a label (e.g., letter) for each reference:
|
|
19
|
+
|
|
20
|
+
<global_prefix>
|
|
21
|
+
<instructions>
|
|
22
|
+
<input_prefix>
|
|
23
|
+
<input> # train
|
|
24
|
+
<input_suffix>
|
|
25
|
+
A. <reference1>
|
|
26
|
+
B. <reference2>
|
|
27
|
+
C. <reference3>
|
|
28
|
+
D. <reference4>
|
|
29
|
+
<output_prefix>
|
|
30
|
+
<chain_of_thought_prefix>
|
|
31
|
+
<instance_chain_of_thought>
|
|
32
|
+
<chain_of_thought_suffix>
|
|
33
|
+
<output>
|
|
34
|
+
<output_suffix>
|
|
35
|
+
|
|
36
|
+
<input_prefix>
|
|
37
|
+
<input> # test
|
|
38
|
+
<input_suffix>
|
|
39
|
+
A. <reference1>
|
|
40
|
+
B. <reference2>
|
|
41
|
+
C. <reference3>
|
|
42
|
+
D. <reference4>
|
|
43
|
+
<output_prefix>
|
|
44
|
+
<chain_of_thought_prefix>
|
|
45
|
+
<instance_chain_of_thought>
|
|
46
|
+
<chain_of_thought_suffix>
|
|
47
|
+
<output>
|
|
48
|
+
<output_suffix>
|
|
49
|
+
<global_suffix>
|
|
50
|
+
|
|
51
|
+
In general, each example is:
|
|
52
|
+
|
|
53
|
+
<input_prefix><input><input_suffix><reference_prefixes[index]><reference> \
|
|
54
|
+
<output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
|
|
58
|
+
"""Return a list of lines corresponding to this example (part of the prompt)."""
|
|
59
|
+
# Input
|
|
60
|
+
result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix
|
|
61
|
+
|
|
62
|
+
# Include the references
|
|
63
|
+
delimiter = ", "
|
|
64
|
+
no_correct_references = "n/a"
|
|
65
|
+
output = no_correct_references
|
|
66
|
+
for reference_index, reference in enumerate(instance.references):
|
|
67
|
+
prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
|
|
68
|
+
result += prefix + reference.output.text + self.adapter_spec.reference_suffix
|
|
69
|
+
if reference.is_correct:
|
|
70
|
+
if output == no_correct_references:
|
|
71
|
+
output = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
|
|
72
|
+
elif self.adapter_spec.multi_label:
|
|
73
|
+
output += delimiter
|
|
74
|
+
output += self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
|
|
75
|
+
|
|
76
|
+
if include_output:
|
|
77
|
+
chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
|
|
78
|
+
chain_of_thought_block = (
|
|
79
|
+
self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
|
|
80
|
+
)
|
|
81
|
+
result += (
|
|
82
|
+
self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
|
|
83
|
+
)
|
|
84
|
+
else:
|
|
85
|
+
result += self.adapter_spec.output_prefix.rstrip()
|
|
86
|
+
|
|
87
|
+
return result
|
|
@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.scenarios.scenario import Instance
|
|
6
6
|
from helm.common.request import Request
|
|
7
|
-
from .in_context_learning_adapter import InContextLearningAdapter
|
|
7
|
+
from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class MultipleChoiceSeparateAdapter(InContextLearningAdapter):
|
|
@@ -14,9 +14,9 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
14
14
|
from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
|
|
15
15
|
from helm.benchmark.adaptation.prompt import Prompt
|
|
16
16
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
17
|
-
from .adapter_factory import AdapterFactory, ADAPT_GENERATION
|
|
18
|
-
from .generation_adapter import GenerationAdapter
|
|
19
|
-
from .test_adapter import TestAdapter
|
|
17
|
+
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_GENERATION
|
|
18
|
+
from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
|
|
19
|
+
from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class TestGenerationAdapter(TestAdapter):
|
|
@@ -5,8 +5,8 @@ from helm.common.tokenization_request import TokenizationToken
|
|
|
5
5
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
6
|
from helm.common.request import Request
|
|
7
7
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
8
|
-
from .adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
|
|
9
|
-
from .test_adapter import TestAdapter
|
|
8
|
+
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
|
|
9
|
+
from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
|
|
10
10
|
from helm.benchmark.scenarios.scenario import TEST_SPLIT, Instance, Input, Reference
|
|
11
11
|
|
|
12
12
|
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
from typing import List, Set
|
|
3
3
|
from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
|
|
4
4
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
5
|
-
from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
|
|
6
|
-
from .test_adapter import TestAdapter
|
|
5
|
+
from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
|
|
6
|
+
from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def _make_instance(
|
|
@@ -4,6 +4,7 @@ from helm.benchmark.adaptation.adapter_spec import (
|
|
|
4
4
|
ADAPT_GENERATION,
|
|
5
5
|
ADAPT_LANGUAGE_MODELING,
|
|
6
6
|
ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
7
|
+
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
|
|
7
8
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
8
9
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
9
10
|
ADAPT_RANKING_BINARY,
|
|
@@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
|
|
|
43
44
|
[output_noun]:
|
|
44
45
|
"""
|
|
45
46
|
|
|
47
|
+
input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
|
|
48
|
+
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
|
|
49
|
+
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
|
|
50
|
+
output_suffix = kwargs.pop("output_suffix", "\n")
|
|
51
|
+
|
|
46
52
|
return AdapterSpec(
|
|
47
53
|
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
48
54
|
instructions=format_instructions(instructions),
|
|
49
|
-
input_prefix=
|
|
50
|
-
input_suffix=
|
|
51
|
-
output_prefix=
|
|
52
|
-
output_suffix=
|
|
55
|
+
input_prefix=input_prefix,
|
|
56
|
+
input_suffix=input_suffix,
|
|
57
|
+
output_prefix=output_prefix,
|
|
58
|
+
output_suffix=output_suffix,
|
|
59
|
+
max_train_instances=max_train_instances,
|
|
60
|
+
num_outputs=num_outputs,
|
|
61
|
+
max_tokens=max_tokens,
|
|
62
|
+
temperature=0.0,
|
|
63
|
+
stop_sequences=["\n"],
|
|
64
|
+
sample_train=sample_train,
|
|
65
|
+
**kwargs,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def get_multiple_choice_joint_chain_of_thought_adapter_spec(
|
|
70
|
+
instructions: str,
|
|
71
|
+
input_noun: Optional[str],
|
|
72
|
+
output_noun: str,
|
|
73
|
+
num_outputs: int = 5,
|
|
74
|
+
max_train_instances: int = 5,
|
|
75
|
+
max_tokens: int = 5,
|
|
76
|
+
sample_train: bool = True,
|
|
77
|
+
**kwargs,
|
|
78
|
+
) -> AdapterSpec:
|
|
79
|
+
"""
|
|
80
|
+
[instructions]
|
|
81
|
+
|
|
82
|
+
[input_noun]: [input]
|
|
83
|
+
[reference_1]
|
|
84
|
+
...
|
|
85
|
+
[reference_k]
|
|
86
|
+
[output_noun]: [output]
|
|
87
|
+
|
|
88
|
+
[input_noun]: [input]
|
|
89
|
+
[reference_1]
|
|
90
|
+
...
|
|
91
|
+
[reference_k]
|
|
92
|
+
[output_noun]:
|
|
93
|
+
"""
|
|
94
|
+
|
|
95
|
+
input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
|
|
96
|
+
input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
|
|
97
|
+
output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
|
|
98
|
+
output_suffix = kwargs.pop("output_suffix", "\n")
|
|
99
|
+
|
|
100
|
+
return AdapterSpec(
|
|
101
|
+
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
|
|
102
|
+
instructions=format_instructions(instructions),
|
|
103
|
+
input_prefix=input_prefix,
|
|
104
|
+
input_suffix=input_suffix,
|
|
105
|
+
output_prefix=output_prefix,
|
|
106
|
+
output_suffix=output_suffix,
|
|
53
107
|
max_train_instances=max_train_instances,
|
|
54
108
|
num_outputs=num_outputs,
|
|
55
109
|
max_tokens=max_tokens,
|
|
@@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
|
|
|
109
163
|
sample_train=sample_train,
|
|
110
164
|
**kwargs,
|
|
111
165
|
)
|
|
166
|
+
elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
|
|
167
|
+
return get_multiple_choice_joint_chain_of_thought_adapter_spec(
|
|
168
|
+
instructions,
|
|
169
|
+
input_noun,
|
|
170
|
+
output_noun,
|
|
171
|
+
max_train_instances=max_train_instances,
|
|
172
|
+
num_outputs=num_outputs,
|
|
173
|
+
max_tokens=max_tokens,
|
|
174
|
+
sample_train=sample_train,
|
|
175
|
+
**kwargs,
|
|
176
|
+
)
|
|
112
177
|
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
|
|
113
178
|
return get_multiple_choice_separate_adapter_spec(method, empty_input)
|
|
114
179
|
else:
|