crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/enem_challenge_scenario.py

```diff
@@ -0,0 +1,58 @@
+from typing import List, Any
+from pathlib import Path
+from datasets import load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class ENEMChallengeScenario(Scenario):
+    """
+    The Exame Nacional do Ensino Médio (ENEM) is an advanced high-school-level exam administered
+    every year by the Brazilian government to students who wish to pursue a university degree.
+
+    The questions cover all kinds of intellectual fields and are divided into four groups:
+    Humanities, Languages, Sciences and Mathematics.
+
+    This scenario is based on the exams administered between 2009 and 2023.
+
+    The dataset can be found at: https://huggingface.co/datasets/eduagarcia/enem_challenge
+    """
+
+    name = "enem_challenge"
+    description = "ENEM Challenge dataset"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the examples
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("eduagarcia/enem_challenge", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question = example["question"]
+            choices = example["choices"]
+            answer = example["answerKey"]
+            # Skip every canceled question
+            if answer == "ANULADO":
+                continue
+            answers_dict = dict(zip(choices["label"], choices["text"]))
+            correct_answer = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+            instance = Instance(
+                input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
+            )
+            instances.append(instance)
+        return instances
```
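For readers unfamiliar with HELM's scenario API, the sketch below shows how a scenario like this is typically exercised. It is a minimal sketch, assuming `crfm-helm` and `datasets` are installed; the output path is hypothetical.

```python
# Minimal usage sketch (assumed setup: crfm-helm and datasets installed; path hypothetical).
from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario

scenario = ENEMChallengeScenario()
instances = scenario.get_instances(output_path="/tmp/enem_challenge")

# Every instance lands in TEST_SPLIT; each answer choice becomes a Reference,
# and only the correct choice carries CORRECT_TAG.
first = instances[0]
print(first.input.text)
print([(reference.output.text, reference.tags) for reference in first.references])
```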
helm/benchmark/scenarios/entity_data_imputation_scenario.py

```diff
@@ -5,7 +5,17 @@ from typing import List, Tuple
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class EntityDataImputationScenario(Scenario):
```
helm/benchmark/scenarios/entity_matching_scenario.py

```diff
@@ -4,8 +4,18 @@ from typing import Dict, List, Tuple
 
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
-from .scenario import
-
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
 
 
 class EntityMatchingScenario(Scenario):
```
helm/benchmark/scenarios/financial_phrasebank_scenario.py

```diff
@@ -0,0 +1,94 @@
+import os
+import random
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class FinancialPhrasebankScenario(Scenario):
+    """
+    A sentiment classification benchmark based on the dataset from Good Debt or Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., 2013)](https://arxiv.org/abs/1307.5336).
+
+    Context:
+    Polar sentiment dataset of sentences from financial news. The dataset consists of 4840 sentences from English
+    language financial news categorized by sentiment. The dataset is divided by agreement rate of 5-8 annotators.
+
+    This release of the financial phrase bank covers a collection of 4840 sentences. The selected collection of
+    phrases was annotated by 16 people with adequate background knowledge on financial markets.
+
+    Given the large number of overlapping annotations (5 to 8 annotations per sentence), there are several ways
+    to define a majority vote based gold standard. To provide an objective comparison, the paper authors have formed 4 alternative
+    reference datasets based on the strength of majority agreement: 100%, 75%, 66% and 50%.
+
+    Data source:
+    https://huggingface.co/datasets/takala/financial_phrasebank
+
+    Reference:
+    P. Malo, A. Sinha, P. Korhonen, J. Wallenius, and P. Takala, "Good debt or bad debt: Detecting semantic orientations in economic texts," Journal of the Association for Information Science and Technology, vol. 65, 2014.
+    https://arxiv.org/pdf/1307.5336
+
+    """  # noqa: E501
+
+    name = "financial_phrasebank"
+    description = "The dataset consists of 4840 sentences from English \
+language financial news categorized by sentiment."
+    tags = ["finance", "sentiment analysis", "classification"]
+
+    INSTRUCTIONS = """The dataset consists of sentences from English language financial news categorized by sentiment.
+Classify the sentences into one of the 3 sentiment categories.
+Possible labels:\n1. positive\n2. neutral\n3. negative"""  # noqa: E501
+    DATASET_URL = "https://huggingface.co/datasets/takala/financial_phrasebank/resolve/598b6aad98f7c8d67be161b12a4b5f2497e07edd/data/FinancialPhraseBank-v1.0.zip"  # noqa: E501
+    AGREEMENT_VALUES = [50, 66, 75, 100]
+    TRAIN_SPLIT_SIZE = 0.7
+
+    def __init__(self, agreement: int, random_seed: int = 121):
+        """The initialization of an instance.
+
+        Args:
+            agreement: The percentage of annotators who agreed on the ground truth label.
+                The value must be one of the integers defined in
+                AGREEMENT_VALUES = [50, 66, 75, 100].
+            random_seed: The random seed for sampling the train/test splits. Defaults to 121.
+        """
+        super().__init__()
+        if agreement not in self.AGREEMENT_VALUES:
+            raise Exception(
+                f"Unknown `agreement` value: {agreement}, allowed values are {self.AGREEMENT_VALUES}"
+            )
+        self.agreement = agreement
+        self.random_seed = random_seed
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            self.DATASET_URL,
+            data_parent_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+        file_name = "Sentences_AllAgree.txt" if self.agreement == 100 else f"Sentences_{self.agreement}Agree.txt"
+        data_file_path = os.path.join(data_parent_path, "FinancialPhraseBank-v1.0", file_name)
+        with open(data_file_path, mode="r", encoding="iso-8859-1") as f:
+            lines = list(f.readlines())
+        random.Random(self.random_seed).shuffle(lines)
+        train_split_index = int(len(lines) * self.TRAIN_SPLIT_SIZE)
+        instances: List[Instance] = []
+        for index, line in enumerate(lines):
+            sentence, label = line.strip().rsplit("@", 1)
+            instance = Instance(
+                input=Input(text=sentence),
+                references=[Reference(Output(text=label), tags=[CORRECT_TAG])],
+                split=TRAIN_SPLIT if index < train_split_index else TEST_SPLIT,
+            )
+            instances.append(instance)
+        return instances
```
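The `agreement` parameter maps directly onto the four files shipped in the FinancialPhraseBank archive. A minimal usage sketch of that mapping and the resulting splits, assuming network access; the output path is hypothetical:

```python
# Usage sketch (hypothetical output path; requires network access for the download).
from collections import Counter

from helm.benchmark.scenarios.financial_phrasebank_scenario import FinancialPhrasebankScenario

scenario = FinancialPhrasebankScenario(agreement=75)  # reads Sentences_75Agree.txt
instances = scenario.get_instances(output_path="/tmp/financial_phrasebank")

# 70% of the shuffled sentences are assigned to the train split, the rest to test.
print(Counter(instance.split for instance in instances))
```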
helm/benchmark/scenarios/gold_commodity_news_scenario.py

```diff
@@ -0,0 +1,124 @@
+import dataclasses
+import math
+import os
+import random
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.runner import TRAIN_SPLIT
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Reference,
+    Scenario,
+    Output,
+)
+
+
+class GoldCommodityNewsScenario(Scenario):
+    """Gold commodity news headline classification
+
+    This dataset contains gold commodity news headlines annotated by humans with regard to
+    whether the news headline discusses past movements and expected directionality in prices, asset comparison
+    and other general information. The task is to classify the news headlines using these labels.
+
+    Paper: https://arxiv.org/abs/2009.04202
+    Dataset: https://www.kaggle.com/datasets/daittan/gold-commodity-news-and-dimensions
+
+    Citation:
+    Ankur Sinha, Tanmay Khandait
+    "Impact of News on the Commodity Market: Dataset and Results." arXiv preprint arXiv:2009.04202 (2020)"""
+
+    name = "gold_commodity_news"
+    description = "The dataset is a collection of news items related to the gold commodities from various sources."
+
+    tags = ["news headline", "classification"]
+
+    CATEGORY_COLUMN_NAMES = {
+        "price_or_not": "Price or Not",
+        "direction_up": "Direction Up",
+        "direction_constant": "Direction Constant",
+        "direction_down": "Direction Down",
+        "past_price": "PastPrice",
+        "future_price": "FuturePrice",
+        "past_news": "PastNews",
+        "future_news": "FutureNews",
+        "assert_comparison": "Asset Comparision",  # column name spelled as in the source CSV
+    }
+
+    CATEGORY_INSTRUCTIONS = {
+        "price_or_not": "the gold price",
+        "direction_up": "the gold price heading up",
+        "direction_constant": "the price remaining constant or stable",
+        "direction_down": "the gold price heading down",
+        "past_price": "any past information about gold prices",
+        "future_price": "any future information about gold prices",
+        "past_news": "any past information other than the gold prices",
+        "future_news": "any future information other than the gold prices",
+        "assert_comparison": "a comparison purely in the context of the gold commodity with another asset",
+    }
+
+    @classmethod
+    def get_instructions(cls, category: str):
+        if category not in GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS:
+            raise ValueError(
+                f"Invalid category: '{category}' "
+                f"Valid categories are: {list(GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS.keys())}"
+            )
+
+        return (
+            "The following are news headlines about the gold commodity. "
+            "Classify whether the news headline discusses "
+            f'{GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS[category]}. Answer only "Yes" or "No".'
+        )
+
+    def __init__(self, category: str):
+        super().__init__()
+        if category not in GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS:
+            raise ValueError(
+                f"Invalid category: '{category}' "
+                f"Valid categories are: {list(GoldCommodityNewsScenario.CATEGORY_INSTRUCTIONS.keys())}"
+            )
+        self.column_name = GoldCommodityNewsScenario.CATEGORY_COLUMN_NAMES[category]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(output_path)
+        data_path = os.path.join(output_path, "finalDataset_0208.csv")
+
+        ensure_file_downloaded(
+            source_url=(
+                "https://www.kaggle.com/api/v1/datasets/download/daittan/"
+                "gold-commodity-news-and-dimensions?dataset_version_number=1"
+            ),
+            target_path=data_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        df = pd.read_csv(data_path)
+
+        instances: List[Instance] = []
+        for _, row in df.iterrows():
+            expected_output: str
+            if row[self.column_name] == 1:
+                expected_output = "Yes"
+            else:
+                expected_output = "No"
+
+            instance = Instance(
+                input=Input(text=str(row["News"])),
+                references=[Reference(Output(text=expected_output), tags=[CORRECT_TAG])],
+                split=str(TEST_SPLIT),
+            )
+            instances.append(instance)
+
+        # no explicit train/test split, so randomly assign 10% of rows to train
+        random.seed(0)
+        train_indexes = random.sample(list(range(len(instances))), k=math.floor(len(instances) / 10))
+        for train_index in train_indexes:
+            instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
+        return instances
```
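Each category value turns the multi-label CSV into its own binary Yes/No classification task, and `get_instructions` builds the matching prompt preamble. A usage sketch, with a hypothetical output path:

```python
# Usage sketch (hypothetical output path; the download requires Kaggle to be reachable).
from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario

# The instructions are assembled from CATEGORY_INSTRUCTIONS:
print(GoldCommodityNewsScenario.get_instructions("direction_up"))
# The following are news headlines about the gold commodity. Classify whether the
# news headline discusses the gold price heading up. Answer only "Yes" or "No".

scenario = GoldCommodityNewsScenario(category="direction_up")  # reads the "Direction Up" column
instances = scenario.get_instances(output_path="/tmp/gold_news")
```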
helm/benchmark/scenarios/gpqa_scenario.py

```diff
@@ -0,0 +1,80 @@
+import datasets
+import os
+import random
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+SUBSETS = ["gpqa_main", "gpqa_diamond", "gpqa_extended"]
+
+# Train example indices below are found by indexing examples given in the original paper repo
+TRAIN_EXAMPLE_INDICES = {
+    "gpqa_main": [339, 105],
+    "gpqa_diamond": [124, 39],
+    "gpqa_extended": [146, 330, 436],
+}
+
+
+class GPQAScenario(Scenario):
+    """GPQA
+
+    GPQA is a multiple-choice, Q&A dataset of very hard questions written and validated by experts in biology, physics,
+    and chemistry. When attempting questions out of their own domain (e.g., a physicist answers a chemistry question),
+    these experts get only 34% accuracy, despite spending >30m with full access to Google."""
+
+    name = "gpqa"
+    description = "A Graduate-Level Google-Proof Q&A Benchmark"
+    tags = ["question answering"]
+
+    def __init__(self, subset: str, random_seed=42):
+        super().__init__()
+        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+        self.subset = subset
+        self.random_seed = random_seed
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get GPQA from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "Idavidrein/gpqa",
+            self.subset,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="train",
+            revision="90b8e5be2b1d3d2dbfe016cdab47981150600c4a",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        random.seed(self.random_seed)
+        instances: List[Instance] = []
+        for idx, row in enumerate(dataset):
+            input = Input(text=row["Question"].strip())
+            references = [
+                Reference(Output(text=row["Correct Answer"].strip()), tags=[CORRECT_TAG]),
+                Reference(Output(text=row["Incorrect Answer 1"].strip()), tags=[]),
+                Reference(Output(text=row["Incorrect Answer 2"].strip()), tags=[]),
+                Reference(Output(text=row["Incorrect Answer 3"].strip()), tags=[]),
+            ]
+            random.shuffle(references)
+            if idx in TRAIN_EXAMPLE_INDICES[self.subset]:
+                extra_data = {
+                    "chain_of_thought": row["Explanation"],
+                }
+                instance = Instance(input=input, references=references, split=TRAIN_SPLIT, extra_data=extra_data)
+            else:
+                instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+
+        return instances
```
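The hard-coded `TRAIN_EXAMPLE_INDICES` reproduce the few-shot examples from the GPQA paper repository, and only those instances carry the paper's chain-of-thought explanation in `extra_data`. A minimal sketch, assuming authenticated Hugging Face access (the dataset is gated) and a hypothetical output path:

```python
# Usage sketch (hypothetical path; Idavidrein/gpqa is gated and needs HF credentials).
from helm.benchmark.scenarios.gpqa_scenario import GPQAScenario
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT

scenario = GPQAScenario(subset="gpqa_diamond")
instances = scenario.get_instances(output_path="/tmp/gpqa")

# Indices [124, 39] become the two train (few-shot) instances for gpqa_diamond.
train_instances = [instance for instance in instances if instance.split == TRAIN_SPLIT]
print(len(train_instances))  # 2
print(train_instances[0].extra_data["chain_of_thought"][:100])
```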
helm/benchmark/scenarios/grammar_scenario.py

```diff
@@ -1,7 +1,7 @@
 from typing import List
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT
-from .grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
 
 
 class GrammarScenario(Scenario):
```
helm/benchmark/scenarios/gsm_scenario.py

```diff
@@ -3,7 +3,16 @@ import os
 from typing import Dict, List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 
 class GSM8KScenario(Scenario):
```
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py

```diff
@@ -0,0 +1,50 @@
+import os
+import pandas as pd
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+class HarmBenchGCGTransferScenario(Scenario):
+    """
+    HarmBenchGCG-T is a standardized evaluation framework for automated red teaming.
+    HarmBench identifies key considerations previously unaccounted for in red teaming
+    evaluations and systematically designed prompts that meet these criteria.
+
+    https://arxiv.org/abs/2402.04249
+    """
+
+    name = "harm_bench_gcg_transfer"
+    description = "HarmBench GCG-T dataset - standard and contextual attacks"
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        os.makedirs(data_path, exist_ok=True)
+        url = "https://raw.githubusercontent.com/farzaank/harmbench-gcg-ensembled/refs/heads/main/output.csv"  # noqa: E501
+        outf_path = os.path.join(data_path, "harmbench_gcg.csv")
+        ensure_file_downloaded(
+            source_url=url,
+            target_path=outf_path,
+            unpack=False,
+        )
+
+        df = pd.read_csv(outf_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        # we take the direct_request template from HarmBench for simplicity
+        # see the example here
+        # https://github.com/centerforaisafety/HarmBench/blob/main/baselines/direct_request/direct_request.py
+        #
+        # in the future we can use other types of templates including various red team attacks
+        for i, row in df.iterrows():
+            references = [Reference(output=Output(text="unsafe behavior"), tags=[])]
+            input_text = row["behavior_prompt"]
+            input = Input(text=input_text)
+            id = row["behavior_name"]
+            instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
+            instances.append(instance)
+        return instances
```
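Note that the single reference is untagged; it appears to act as a placeholder, with harmfulness judged downstream by an annotator rather than by reference matching. A usage sketch with a hypothetical output path:

```python
# Usage sketch (hypothetical output path).
from helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario import HarmBenchGCGTransferScenario

scenario = HarmBenchGCGTransferScenario()
instances = scenario.get_instances(output_path="/tmp/harmbench_gcg")

# Each CSV row becomes one test instance; the id is the HarmBench behavior name.
print(instances[0].id, instances[0].split)
```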
helm/benchmark/scenarios/harm_bench_scenario.py

```diff
@@ -4,7 +4,7 @@ from typing import List
 
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
 
 
 class HarmBenchScenario(Scenario):
```
helm/benchmark/scenarios/headqa_scenario.py

```diff
@@ -0,0 +1,131 @@
+import os
+from typing import List, Optional
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class HeadQAScenario(Scenario):
+    """
+    From "HEAD-QA: A Healthcare Dataset for Complex Reasoning" (Vilares et al.), HEAD-QA is a multi-choice
+    question-answering dataset designed to evaluate reasoning on challenging healthcare-related questions.
+    The questions are sourced from Spanish healthcare exams for specialized positions, covering various topics
+    such as Medicine, Nursing, Psychology, Chemistry, Pharmacology, and Biology.
+
+    Example from the dataset:
+
+    Question:
+    The excitatory postsynaptic potentials:
+
+    A) They are all or nothing.
+    B) They are hyperpolarizing.
+    C) They can be added.
+    D) They spread long distances.
+
+    Answer:
+    The answer is C. Explanation: None provided in this dataset.
+
+    @InProceedings{HEAD-QA,
+    author = {David Vilares and Manuel Vilares and Carlos Gómez-Rodríguez},
+    title = {HEAD-QA: A Healthcare Dataset for Complex Reasoning},
+    year = {2019},
+    abstract = {We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex
+    reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system,
+    and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and
+    cross-lingual (to English) experiments with information retrieval and neural techniques. We show that:
+    (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance,
+    demonstrating its usefulness as a benchmark for future work.}}
+
+
+    Task:
+    Given a question and its multiple-choice answers, models must identify the correct answer, corresponding to the
+    `ra` field in the dataset. The dataset spans six healthcare domains and is challenging even for experts.
+    """
+
+    HUGGING_FACE_DATASET_PATH: str = "dvilares/head_qa"
+    SKIP_VQA: bool = True
+    SKIP_TEXTQA: bool = False
+
+    name = "head_qa"
+    description = "A collection of biomedical multiple-choice questions for testing medical knowledge."
+    tags = ["question_answering", "biomedical", "medicine"]
+
+    def __init__(self, language: str = "en", category: Optional[str] = None):
+        """Initialize the HEAD-QA scenario.
+
+        Args:
+            language (str, optional): Language of the dataset. Defaults to "en".
+            category (str, optional): Category of the dataset. If None, all categories are used.
+        """
+        super().__init__()
+        self.language: str = language
+        self.category: Optional[str] = category
+        assert (
+            self.SKIP_VQA or self.SKIP_TEXTQA
+        ), "Failed to initialize HeadQAScenario, one of `SKIP_VQA` or `SKIP_TEXTQA` must be True."
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+        dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH, self.language)
+
+        # XXX: Should we consider validation as test too?
+        # splits = {TRAIN_SPLIT: ["train", "validation"], TEST_SPLIT: ["test"]}
+        # Limit to zero shot setting
+        splits = {TEST_SPLIT: ["test"]}
+        instances: List[Instance] = []
+        for (
+            helm_split_name,
+            dataset_splits_name,
+        ) in splits.items():  # Iterate over the splits
+            for dataset_split_name in dataset_splits_name:
+                split_data = dataset[dataset_split_name]
+
+                for example in split_data:
+                    # Whether to process Visual Question Answering (VQA) examples
+                    if self.SKIP_VQA and example["image"] is not None:
+                        continue
+
+                    # Whether to process Text Question Answering (TextQA) examples
+                    if self.SKIP_TEXTQA and example["image"] is None:
+                        continue
+
+                    # If specified, filter by category
+                    if self.category is not None:
+                        if example["category"] != self.category:
+                            continue
+
+                    question = example["qtext"]
+
+                    # Format the final answer with explanation
+                    instances.append(
+                        Instance(
+                            input=Input(text=question),
+                            references=[
+                                Reference(
+                                    Output(text=option["atext"]),
+                                    tags=[CORRECT_TAG] if option["aid"] == example["ra"] else [],
+                                )
+                                for option in example["answers"]
+                            ],
+                            split=helm_split_name,
+                            extra_data={
+                                "id": example["qid"],
+                                "name": example["name"],
+                                "category": example["category"],
+                                "year": example["year"],
+                            },
+                        )
+                    )
+
+        return instances
```
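`language` selects the Spanish or English configuration and `category` optionally restricts the questions to one domain. A minimal sketch; the output path is hypothetical and the category value is illustrative (it must match the dataset's `category` field):

```python
# Usage sketch (hypothetical path; category value illustrative).
from helm.benchmark.scenarios.headqa_scenario import HeadQAScenario

scenario = HeadQAScenario(language="en", category="medicine")
instances = scenario.get_instances(output_path="/tmp/head_qa")

print(len(instances))
print(instances[0].extra_data["category"])
```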
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py

```diff
@@ -0,0 +1,37 @@
+import csv
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+
+
+_DATA_DIRECTORY_PATH = "restricted/helpdesk_call_summarization/HELM Sample Transcripts_20241221_0045"
+
+
+class HelpdeskCallSummarizationScenario(Scenario):
+    """Helpdesk call summarization."""
+
+    name = "helpdesk_call_summarization"
+    description = "Helpdesk call summarization."
+    tags = ["helpdesk_call_center"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for file_name in os.listdir(_DATA_DIRECTORY_PATH):
+            if not file_name.endswith(".csv") or not file_name.startswith("Call1-"):
+                continue
+            file_path = os.path.join(_DATA_DIRECTORY_PATH, file_name)
+            with open(file_path) as f:
+                csv_reader = csv.reader(f)
+                prompt_lines = [f"{row[0]}: {row[4]}" for row in csv_reader]
+                prompt = "\n".join(prompt_lines)
+            instance_id = file_name.removeprefix("Call1-").removesuffix(".csv")
+            input = Input(text=prompt)
+            instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
```
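The scenario assumes a particular transcript layout: column 0 holds the speaker and column 4 the utterance text. Since the real transcripts live under a restricted directory, here is a sketch with a fabricated row showing how one line of the prompt is formed:

```python
# Sketch of the expected transcript row shape (the row itself is hypothetical;
# the real CSVs live under the restricted data directory).
import csv
import io

row = next(csv.reader(io.StringIO('Agent,,,,"Thank you for calling, how can I help?"')))
print(f"{row[0]}: {row[4]}")  # Agent: Thank you for calling, how can I help?
```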