crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/spider_scenario.py
@@ -0,0 +1,91 @@
+import json
+import os
+from typing import Dict, List
+
+from filelock import FileLock
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+    generate_schema_prompt,
+)
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Scenario,
+    Instance,
+    Reference,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
+
+
+def _ensure_file_unzipped(source_path: str, target_path: str):
+    with FileLock(f"{target_path}.lock"):
+        if os.path.exists(target_path):
+            hlog(f"Not decompressing {source_path} because {target_path} already exists")
+            return
+        tmp_path = target_path + ".tmp"
+        ensure_directory_exists(tmp_path)
+        shell(["unzip", source_path, "-d", tmp_path])
+        shell(["mv", tmp_path, target_path])
+
+
+class SpiderScenario(Scenario):
+    """Spider 1.0"""
+
+    name = "spider"
+    description = "spider"
+    tags = ["sql"]
+
+    INSTRUCTIONS_PROMPT = """-- Using valid SQLite, answer the following questions for the tables provided above.
+"""
+    COT_PROMPT = """
+Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+<reasoning>
+INSERT_YOUR_REASONING_HERE
+</reasoning>
+<sql>
+INSERT_YOUR_SQL_QUERY_HERE
+</sql>"""  # noqa: E501
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            "https://drive.google.com/uc?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&export=download&confirm=t",
+            data_parent_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+        data_root_path = os.path.join(data_parent_path, "spider_data")
+        databases_root_path = os.path.join(data_root_path, "test_database")
+
+        database_schema_prompts: Dict[str, str] = {}
+        for database_name in os.listdir(databases_root_path):
+            database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+            if not os.path.exists(database_path):
+                # Ignore stray ".DS_Store" directory
+                continue
+
+            database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+            database_schema_prompts[database_name] = database_schema_prompt
+
+        instances: List[Instance] = []
+        dataset_path = os.path.join(data_root_path, "test.json")
+        dataset = json.load(open(dataset_path, "r"))
+        for row in dataset:
+            database_id: str = row["db_id"]
+            question: str = row["question"]
+            gold_sql: str = row["query"]
+
+            schema_prompt = database_schema_prompts[database_id]
+            combined_prompt = schema_prompt + "\n\n" + self.INSTRUCTIONS_PROMPT + question + self.COT_PROMPT
+            instance = Instance(
+                input=Input(text=combined_prompt),
+                references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                extra_data={"db_id": row["db_id"]},
+                split=VALID_SPLIT,
+            )
+            instances.append(instance)
+        return instances
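To make the prompt layout above concrete, here is a minimal standalone sketch of the assembly done in get_instances; the schema and question values are invented placeholders, not actual Spider data:

    # Hypothetical illustration of SpiderScenario's prompt assembly; the
    # schema and question strings are placeholders, not real dataset content.
    INSTRUCTIONS_PROMPT = "-- Using valid SQLite, answer the following questions for the tables provided above.\n"
    COT_PROMPT = "\nThink step by step, then generate a single SQL query in valid SQLite syntax."

    schema_prompt = "CREATE TABLE singer (singer_id INTEGER, name TEXT)"  # placeholder schema
    question = "How many singers are there?"  # placeholder question

    # Same concatenation order as the scenario: schema, instructions, question, CoT suffix.
    combined_prompt = schema_prompt + "\n\n" + INSTRUCTIONS_PROMPT + question + COT_PROMPT
    print(combined_prompt)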
helm/benchmark/scenarios/starr_patient_instructions_scenario.py
@@ -0,0 +1,90 @@
+import os
+import csv
+from typing import List
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class StarrPatientInstructionsScenario(Scenario):
+    """
+    Starr Patient Instructions is a dataset created from STARR-OMOP data, containing after-visit instructions
+    for outpatient surgeries/procedures. Each example corresponds to one surgery or procedure case (only including
+    outpatient or observation/overnight cases with discharge within 24 hours) and includes the following fields:
+
+    - Diagnosis: Why the patient needs the surgery/procedure.
+    - ActualProcedure: The surgery/procedure name.
+    - HistoryPhysicalNoteText: The History & Physical note written by the surgeon.
+    - OperativeNoteText: The report describing what was done during the surgery/procedure.
+    - DischargeInstructionNoteText: The specific after-surgery care instructions given to the patient.
+
+    The task is to generate personalized post-procedure patient instructions based on the provided case details.
+
+    Sample Synthetic Prompt:
+        Given the following case details, generate personalized after-surgery care instructions.
+
+        Diagnosis: [diagnosis text]
+        Procedure: [actual procedure text]
+        History & Physical: [H&P note text]
+        Operative Report: [operative note text]
+
+        Patient Instructions:
+    """
+
+    name = "starr_patient_instructions"
+    description = "A dataset containing case details used to generate customized post-procedure patient instructions."
+    tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        csv_path = "/share/pi/nigam/suhana/medhelm/data/starr-omop-personalized-care-instr/dataset_cases_qc.csv"
+        # Ensure the directory for the CSV file exists.
+        ensure_directory_exists(os.path.dirname(csv_path))
+
+        instances: List[Instance] = []
+        # For now, we assign all instances to the test split (zero-shot setting).
+        split = TEST_SPLIT
+
+        with open(csv_path, "r", encoding="utf-8") as csvfile:
+            reader = csv.DictReader(csvfile)
+            for row in reader:
+                # Retrieve and strip the relevant fields.
+                qc_value = row.get("QC", "").strip().upper()
+                if qc_value != "TRUE":
+                    continue
+                diagnosis = row.get("Diagnosis", "").strip()
+                actual_procedure = row.get("ActualProcedure", "").strip()
+                history_physical = row.get("HistoryPhysicalNoteText", "").strip()
+                operative_note = row.get("OperativeNoteText", "").strip()
+                discharge_instruction = row.get("DischargeInstructionNoteText", "").strip()
+
+                # Skip the instance if any required field is missing.
+                if not (
+                    diagnosis and actual_procedure and history_physical and operative_note and discharge_instruction
+                ):
+                    continue
+
+                # Construct the input prompt by concatenating the fields.
+                input_text = (
+                    f"Diagnosis: {diagnosis}\n"
+                    f"Procedure: {actual_procedure}\n"
+                    f"History & Physical: {history_physical}\n"
+                    f"Operative Report: {operative_note}\n\n"
+                )
+
+                instances.append(
+                    Instance(
+                        input=Input(text=input_text),
+                        references=[Reference(Output(text=discharge_instruction), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
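The dataset path above points to a restricted share, so for illustration only, here is a synthetic row matching the columns the scenario reads (every value below is invented) and how it passes the QC filter:

    import csv
    import io

    # Entirely synthetic example row; the real STARR-OMOP data is restricted.
    csv_text = (
        "QC,Diagnosis,ActualProcedure,HistoryPhysicalNoteText,OperativeNoteText,DischargeInstructionNoteText\n"
        'TRUE,Inguinal hernia,Laparoscopic hernia repair,H&P note text,Operative note text,"Keep the incision clean and dry for 48 hours."\n'
    )
    row = next(csv.DictReader(io.StringIO(csv_text)))
    # Rows whose QC column is not "TRUE" (case-insensitive) are skipped by the scenario.
    assert row.get("QC", "").strip().upper() == "TRUE"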
helm/benchmark/scenarios/summarization_scenario.py
@@ -3,7 +3,17 @@ import pickle
 
 from typing import List, Optional
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class SummarizationScenario(Scenario):
helm/benchmark/scenarios/sumosum_scenario.py
@@ -0,0 +1,157 @@
+import os
+import re
+from typing import Dict, List, Optional
+
+import pandas as pd
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+try:
+    # Needed for pandas.read_excel
+    import openpyxl  # noqa
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["ibm-enterprise-scenarios"])
+
+
+class SUMOSumScenario(Scenario):
+    """SUMO Web Claims Summarization
+
+    SUMO Web Claims Summarization is a summarization task over the climate subset from the SUMO dataset.
+    The task is to write a title based on the article contents.
+
+    Citation:
+    @inproceedings{mishra-etal-2020-generating,
+        title = "Generating Fact Checking Summaries for Web Claims",
+        author = "Mishra, Rahul and
+          Gupta, Dhruv and
+          Leippold, Markus",
+        editor = "Xu, Wei and
+          Ritter, Alan and
+          Baldwin, Tim and
+          Rahimi, Afshin",
+        booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.wnut-1.12",
+        doi = "10.18653/v1/2020.wnut-1.12",
+        pages = "81--90",
+        abstract = "We present SUMO, a neural attention-based approach that learns to establish correctness of textual claims based on evidence in the form of text documents (e.g., news articles or web documents). SUMO further generates an extractive summary by presenting a diversified set of sentences from the documents that explain its decision on the correctness of the textual claim. Prior approaches to address the problem of fact checking and evidence extraction have relied on simple concatenation of claim and document word embeddings as an input to claim driven attention weight computation. This is done so as to extract salient words and sentences from the documents that help establish the correctness of the claim. However this design of claim-driven attention fails to capture the contextual information in documents properly. We improve on the prior art by using improved claim and title guided hierarchical attention to model effective contextual cues. We show the efficacy of our approach on political, healthcare, and environmental datasets.",
+    }
+    """  # noqa: E501
+
+    name = "sumosum"
+    description = "Text summarization with climate corpus"
+    tags = ["summarization", "climate"]
+
+    TRAIN_RATIO = 0.2
+    TITLE_KEY = "Title"
+    DOCUMENT_KEY = "Doc_text"
+
+    def __init__(
+        self,
+        train_filter_min_length: Optional[int] = None,
+        train_filter_max_length: Optional[int] = None,
+        test_filter_min_length: Optional[int] = None,
+        test_filter_max_length: Optional[int] = None,
+        truncate_length: Optional[int] = None,
+    ):
+        """
+        Initializes the scenario.
+        Args:
+            train_filter_min_length: Int indicating minimum length for training
+                                     documents. Train examples smaller than
+                                     train_filter_min_length tokens will be filtered out.
+            train_filter_max_length: Int indicating maximum length for training
+                                     documents. Train examples larger than
+                                     train_filter_max_length tokens will be filtered out.
+            test_filter_min_length: Int indicating minimum length for training
+                                    documents. Test examples smaller than
+                                    test_filter_min_length tokens will be filtered out.
+            test_filter_max_length: Int indicating maximum length for training
+                                    documents. Test examples larger than
+                                    test_filter_max_length tokens will be filtered out.
+            truncate_length: Int indicating the maximum length in tokens to
+                             truncate documents. Documents in all splits will be
+                             truncated to truncate_length tokens.
+            NOTE: Whitespace tokenization is used to compute tokens.
+        """
+        super().__init__()
+        self.train_filter_min_length = train_filter_min_length
+        self.train_filter_max_length = train_filter_max_length
+        self.test_filter_min_length = test_filter_min_length
+        self.test_filter_max_length = test_filter_max_length
+        self.truncate_length = truncate_length
+
+    @staticmethod
+    def _clean_and_truncate(text: str, max_length: Optional[int] = None) -> str:
+        text = re.sub(r"\s+", " ", text)
+        return " ".join(text.split()[:max_length])
+
+    def _load_dataset(self, output_path: str) -> Dict[str, pd.DataFrame]:
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        source_url = "https://github.com/rahulOmishra/SUMO/raw/main/climate_claims_raw.xlsx"
+        source_file = os.path.basename(source_url)
+        target_path = os.path.join(data_dir, source_file)
+        ensure_file_downloaded(
+            source_url=source_url,
+            target_path=target_path,
+        )
+
+        # Column headers: Claim_id(int),Claim,Title,Doc_text,Label(bool)
+        target_df = pd.read_excel(target_path, skiprows=1)
+        target_df = target_df.dropna(subset=[SUMOSumScenario.TITLE_KEY, SUMOSumScenario.DOCUMENT_KEY])
+        # Remove carriage return _x000D_ in Excel string
+        target_df = target_df.replace({r"_x000D_": ""}, regex=True)
+        # target_df = target_df.replace({r"_x([0-9a-fA-F]{4})_": ""}, regex=True)
+        # Split randomly (works better than split by order)
+        train_df = target_df.sample(frac=SUMOSumScenario.TRAIN_RATIO, random_state=0)
+        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset_dict = self._load_dataset(output_path)
+
+        instances: List[Instance] = []
+
+        for split, split_data in dataset_dict.items():
+            for example in split_data.itertuples():
+                document = getattr(example, SUMOSumScenario.DOCUMENT_KEY)
+                title = getattr(example, SUMOSumScenario.TITLE_KEY)
+                art_len = len(document.split())
+                if split == TEST_SPLIT:
+                    if self.test_filter_max_length and art_len > self.test_filter_max_length:
+                        continue
+                    if self.test_filter_min_length and art_len < self.test_filter_min_length:
+                        continue
+                if split == TRAIN_SPLIT:
+                    if self.train_filter_max_length and art_len > self.train_filter_max_length:
+                        continue
+                    if self.train_filter_min_length and art_len < self.train_filter_min_length:
+                        continue
+
+                document = SUMOSumScenario._clean_and_truncate(document, self.truncate_length)
+                title = SUMOSumScenario._clean_and_truncate(title)
+
+                instance = Instance(
+                    input=Input(text=document),
+                    references=[Reference(output=Output(text=title), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+                instances.append(instance)
+
+        return instances
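As a quick sanity check on the whitespace-tokenization helper above, a standalone copy of _clean_and_truncate behaves as follows:

    import re
    from typing import Optional

    def _clean_and_truncate(text: str, max_length: Optional[int] = None) -> str:
        # Collapse whitespace runs to single spaces, then keep at most
        # max_length whitespace-delimited tokens (all tokens when None).
        text = re.sub(r"\s+", " ", text)
        return " ".join(text.split()[:max_length])

    assert _clean_and_truncate("a \n b\tc  d", max_length=3) == "a b c"
    assert _clean_and_truncate("  a  b  ") == "a b"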
helm/benchmark/scenarios/synthetic_efficiency_scenario.py
@@ -2,7 +2,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
 
 NUM_INPUT_TOKENS: List[int] = [
     1,
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py
@@ -67,7 +67,17 @@ from copy import copy
 from typing import List, Dict, Literal, Tuple
 from dataclasses import dataclass
 
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 @dataclass(frozen=True)
helm/benchmark/scenarios/synthetic_reasoning_scenario.py
@@ -25,7 +25,17 @@ The model hence is asked to do the following three tasks:
 import numpy as np
 from typing import List, Dict, Tuple
 
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 ANIMALS = ["zebra", "cobra", "stork", "penguin", "shark", "lion", "buffalo", "whale", "seal", "eagle", "horse", "rat"]
 FRUITS = ["apple", "peach", "watermelon", "banana", "grape", "kiwi", "pear", "strawberry", "blueberry", "blackberry"]
helm/benchmark/scenarios/test_bigcodebench_scenario.py
@@ -0,0 +1,26 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bigcodebench_scenario import BigCodeBenchScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_bigcodebench_scenario_get_instances():
+    bigcodebench_scenario = BigCodeBenchScenario("v0.1.2")
+    with TemporaryDirectory() as tmpdir:
+        instances = bigcodebench_scenario.get_instances(tmpdir)
+        assert len(instances) == 1140
+        assert instances[0].id == "BigCodeBench/0"
+        assert instances[0].input == Input(
+            text=(
+                "Calculates the average of the sums of absolute differences between each pair "
+                "of consecutive numbers for all permutations of a given list. Each permutation "
+                "is shuffled before calculating the differences. Args: - numbers (list): A list "
+                "of numbers. Default is numbers from 1 to 10.\nThe function should output with:\n"
+                "    float: The average of the sums of absolute differences for each shuffled permutation "
+                "of the list.\nYou should write self-contained code starting with:\n```\nimport itertools\n"
+                "from random import shuffle\ndef task_func(numbers=list(range(1, 3))):\n```"
+            )
+        )
+        assert instances[0].split == TEST_SPLIT
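These scenario tests are selected via the pytest.mark.scenarios marker; assuming the "scenarios" marker is registered in the project's pytest configuration, a targeted run would look something like:

    pytest -m scenarios helm/benchmark/scenarios/test_bigcodebench_scenario.py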
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py
@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.czech_bank_qa_scenario import CzechBankQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_czech_bank_qa_scenario_get_instances():
+    scenario = CzechBankQAScenario(config_name="default")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+        assert len(actual_instances) == 30
+        assert actual_instances[0].input == Input(text="Get the total number of accounts in the system")
+        assert len(actual_instances[0].references) == 1
+        assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+        assert actual_instances[0].references[0].output.text == "SELECT COUNT(*) FROM ACCOUNT"
+        assert actual_instances[0].split == "test"
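The gold query asserted above is plain SQLite, so it can be smoke-tested against a throwaway in-memory database; the ACCOUNT table and rows below are invented stand-ins, not the real Czech bank data:

    import sqlite3

    # Hypothetical check that the gold query is valid SQLite.
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE ACCOUNT (account_id INTEGER PRIMARY KEY)")
    conn.executemany("INSERT INTO ACCOUNT (account_id) VALUES (?)", [(1,), (2,), (3,)])
    (count,) = conn.execute("SELECT COUNT(*) FROM ACCOUNT").fetchone()
    assert count == 3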
@@ -0,0 +1,53 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.enem_challenge_scenario import ENEMChallengeScenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_enem_challenge_scenario():
+    enem_scenario = ENEMChallengeScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = enem_scenario.get_instances(tmpdir)
+        assert len(instances) == 1431
+        assert instances[0].split == TEST_SPLIT
+
+        assert instances[0].input.text.startswith(
+            "A atmosfera terrestre é composta pelos gases nitrogênio (N2) e oxigênio (O2)"
+        )
+        assert len(instances[0].input.text) == 1163
+
+        assert instances[0].references == [
+            Reference(
+                output=Output(
+                    text="reduzir o calor irradiado pela Terra mediante a substituição da produção primária pela industrialização refrigerada. "  # noqa: E501
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="promover a queima da biomassa vegetal, responsável pelo aumento do efeito estufa devido à produção de CH4. "  # noqa: E501
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="reduzir o desmatamento, mantendo-se, assim, o potencial da vegetação em absorver o CO2 da atmosfera. "  # noqa: E501
+                ),
+                tags=[CORRECT_TAG],
+            ),
+            Reference(
+                output=Output(
+                    text="aumentar a concentração atmosférica de H2O, molécula capaz de absorver grande quantidade de calor. "  # noqa: E501
+                ),
+                tags=[],
+            ),
+            Reference(
+                output=Output(
+                    text="remover moléculas orgânicas polares da atmosfera, diminuindo a capacidade delas de reter calor. "  # noqa: E501
+                ),
+                tags=[],
+            ),
+        ]
+        assert instances[0].references[2].is_correct
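Because the full references list is compared by value, the test hard-codes the gold alternative's index (`references[2]`). The index can also be recovered from the tags; a minimal sketch using the same dataclasses:

```python
# A minimal sketch: recovering the gold option's index from the tags rather
# than hard-coding references[2] as the assert above does.
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

references = [
    Reference(output=Output(text="alternative A"), tags=[]),
    Reference(output=Output(text="alternative B"), tags=[]),
    Reference(output=Output(text="alternative C"), tags=[CORRECT_TAG]),
]
correct_index = next(i for i, ref in enumerate(references) if ref.is_correct)
assert correct_index == 2
```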
@@ -13,9 +13,13 @@ def test_ewok_scenario():
     with TemporaryDirectory() as tmpdir:
         try:
             instances = scenario.get_instances(tmpdir)
-        except DatasetNotFoundError:
+        except (DatasetNotFoundError, Exception):
+            # Catch generic Exception here because the Hugging Face Hub client can raise
+            # an authentication issue as the following exception:
+            # huggingface_hub.utils._errors.GatedRepoError: 401 Client Error.
+            # but GatedRepoError is a type in a private package, so we avoid referencing it
             pytest.skip("Unable to access gated dataset on Hugging Face Hub; skipping test")
-        assert len(instances) ==
+        assert len(instances) == 8750
         assert "believes" in instances[0].input.text
         assert len(instances[0].references) == 2
         assert "inside" in instances[0].references[0].output.text
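Worth noting: `except (DatasetNotFoundError, Exception)` is semantically identical to `except Exception`, since `Exception` already subsumes `DatasetNotFoundError`; the tuple form just documents the specific error the authors expect. A minimal sketch of an alternative that stays narrow without importing the private type (the helper below is an illustration, not HELM's code):

```python
# A minimal sketch, not HELM's code: skip only on the gated-repo failure by
# matching the exception's class name, re-raising anything else.
import pytest


def load_or_skip(scenario, tmpdir):
    try:
        return scenario.get_instances(tmpdir)
    except Exception as e:
        # GatedRepoError lives in a private huggingface_hub module, so match
        # it by name instead of importing it.
        if type(e).__name__ == "GatedRepoError":
            pytest.skip("Unable to access gated dataset on Hugging Face Hub")
        raise
```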
@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_gold_commodity_news_scenario():
+    scenario = GoldCommodityNewsScenario(category="price_or_not")
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 11412
+        assert instances[0].input == Input(text="april gold down 20 cents to settle at $1,116.10/oz")
+        assert instances[0].references == [
+            Reference(output=Output(text="Yes"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "test"
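All of these tests carry the `scenarios` marker, which pulls real datasets and is therefore slow; the marker lets them be selected or excluded as a group. A minimal sketch of selecting them programmatically (the test path is inferred from the import, not stated in this diff):

```python
# A minimal sketch: run only marker-selected tests. Equivalent to the CLI
# invocation `pytest -m scenarios <path>`; the path below is hypothetical.
import pytest

pytest.main(["-m", "scenarios", "helm/benchmark/scenarios/test_gold_commodity_news_scenario.py"])
```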
@@ -0,0 +1,44 @@
+import pytest
+from tempfile import TemporaryDirectory
+from helm.benchmark.scenarios.gpqa_scenario import GPQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+@pytest.mark.scenarios
+def test_gpqa_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = GPQAScenario(subset="gpqa_main")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 448
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 689
+        references = instances[0].references
+        assert len(references[0].output.text) == 10
+        assert len(references[1].output.text) == 6
+        assert len(references[2].output.text) == 9
+        assert len(references[3].output.text) == 7
+        assert references[3].tags == [CORRECT_TAG]
+
+        scenario = GPQAScenario(subset="gpqa_diamond")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 198
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 262
+        references = instances[0].references
+        assert len(references[0].output.text) == 8
+        assert len(references[1].output.text) == 9
+        assert len(references[2].output.text) == 8
+        assert len(references[3].output.text) == 8
+        assert references[3].tags == [CORRECT_TAG]
+
+        scenario = GPQAScenario(subset="gpqa_extended")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 546
+        assert instances[0].split == "test"
+        assert len(instances[0].input.text) == 689
+        references = instances[0].references
+        assert len(references[0].output.text) == 10
+        assert len(references[1].output.text) == 6
+        assert len(references[2].output.text) == 9
+        assert len(references[3].output.text) == 7
+        assert references[3].tags == [CORRECT_TAG]
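The three subset blocks above are structurally identical; a minimal sketch of how they could be collapsed with `pytest.mark.parametrize` (the function name is hypothetical; the expected counts are copied from the asserts above):

```python
# A minimal sketch: one parametrized test instead of three repeated blocks.
import pytest
from tempfile import TemporaryDirectory

from helm.benchmark.scenarios.gpqa_scenario import GPQAScenario


@pytest.mark.scenarios
@pytest.mark.parametrize(
    "subset,expected_count",
    [("gpqa_main", 448), ("gpqa_diamond", 198), ("gpqa_extended", 546)],
)
def test_gpqa_scenario_counts(subset, expected_count):
    with TemporaryDirectory() as tmpdir:
        instances = GPQAScenario(subset=subset).get_instances(tmpdir)
    assert len(instances) == expected_count
```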
@@ -0,0 +1,36 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.ifeval_scenario import IFEvalScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_ifeval_scenario_get_instances():
+    ifeval_scenario = IFEvalScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = ifeval_scenario.get_instances(tmpdir)
+        assert len(instances) == 541
+        assert instances[0].input == Input(
+            text=(
+                "Write a 300+ word summary of the wikipedia page "
+                '"https://en.wikipedia.org/wiki/Raymond_III,_Count_of_Tripoli". '
+                "Do not use any commas and highlight at least 3 sections that has titles in markdown format, "
+                "for example *highlighted section part 1*, *highlighted section part 2*, "
+                "*highlighted section part 3*."
+            )
+        )
+        assert instances[0].split == TEST_SPLIT
+        assert instances[0].extra_data
+        assert instances[0].extra_data["instruction_ids"] == [
+            "punctuation:no_comma",
+            "detectable_format:number_highlighted_sections",
+            "length_constraints:number_words",
+        ]
+        kwargs_groups = instances[0].extra_data["instruction_kwargs"]
+        assert all(_ is None for _ in kwargs_groups[0].values())
+        assert kwargs_groups[1]["num_highlights"] == 3
+        assert all(kwargs_groups[1][key] is None for key in kwargs_groups[1] if key != "num_highlights")
+        assert kwargs_groups[2]["relation"] == "at least"
+        assert kwargs_groups[2]["num_words"] == 300
+        assert all(kwargs_groups[2][key] is None for key in kwargs_groups[2] if key not in {"relation", "num_words"})
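The kwargs asserts imply a specific shape for `extra_data`: `instruction_kwargs` is a list of dicts aligned index-for-index with `instruction_ids`, each carrying a shared key set where unused keys are `None`. A minimal sketch of that assumed shape (the key set shown is truncated and hypothetical):

```python
# A minimal sketch (assumed shape, reconstructed from the asserts above; the
# real key set is larger than shown here).
extra_data = {
    "instruction_ids": [
        "punctuation:no_comma",
        "detectable_format:number_highlighted_sections",
        "length_constraints:number_words",
    ],
    "instruction_kwargs": [
        # one dict per instruction id, same position; unused keys stay None
        {"num_highlights": None, "relation": None, "num_words": None},
        {"num_highlights": 3, "relation": None, "num_words": None},
        {"num_highlights": None, "relation": "at least", "num_words": 300},
    ],
}
assert all(v is None for v in extra_data["instruction_kwargs"][0].values())
```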
@@ -0,0 +1,27 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.imdb_ptbr_scenario import IMDB_PTBRScenario
+from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_imdb_ptbr_scenario():
+    imdb_ptbr = IMDB_PTBRScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = imdb_ptbr.get_instances(tmpdir)
+        assert len(instances) == 30000
+        assert instances[0].split == TRAIN_SPLIT
+
+        assert instances[10].input.text.startswith(
+            "Foi ótimo ver algumas das minhas estrelas favoritas de 30 anos atrás, "
+            "incluindo John Ritter, Ben Gazarra e Audrey Hepburn."
+        )
+        assert len(instances[10].input.text) == 1549
+
+        assert instances[10].references == [
+            Reference(
+                output=Output(text="negativo"),
+                tags=[CORRECT_TAG],
+            )
+        ]
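One consistency note across these tests: some compare `split` to the constants (`TRAIN_SPLIT`, `TEST_SPLIT`) and others to the literal `"test"`, which only lines up if the constants are plain strings. A minimal check of that assumption:

```python
# A minimal sketch: the split constants are assumed to be the plain strings
# "train" and "test", which makes the mixed comparisons above consistent.
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, TEST_SPLIT

assert TRAIN_SPLIT == "train"  # assumption, consistent with the asserts above
assert TEST_SPLIT == "test"
```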