crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/scenarios/shc_bmt_scenario.py
@@ -0,0 +1,69 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCBMTMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient status gold-standard
+    for specific questions asked after a bone marrow transplant has taken place.
+    """
+
+    name = "shc_bmt_med"
+    description = (
+        "A dataset containing patient notes with associated "
+        "questions and answers related to bone marrow transplantation."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-BMT-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
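The remaining SHC scenarios below follow the same pattern as shc_bmt_scenario.py above: each CSV row becomes one test-split Instance whose prompt embeds the question and context, and every answer choice becomes a Reference, with CORRECT_TAG attached only to the choice that matches the row's gold label. A minimal sketch of that reference construction, outside the diff and using only names the new files already import (the gold_answer value is a hypothetical stand-in for row["label"]):

from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

POSSIBLE_ANSWER_CHOICES = ["A", "B"]
gold_answer = "A"  # hypothetical stand-in for the CSV's "label" column

# One Reference per answer choice; only the gold choice carries CORRECT_TAG,
# which is how downstream multiple-choice metrics identify the correct option.
references = [
    Reference(Output(text=choice), tags=[CORRECT_TAG] if choice == gold_answer else [])
    for choice in POSSIBLE_ANSWER_CHOICES
]
assert [ref.tags for ref in references] == [[CORRECT_TAG], []]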
helm/benchmark/scenarios/shc_cdi_scenario.py
@@ -0,0 +1,70 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCDIMedScenario(Scenario):
+    """
+    This benchmark dataset was built from Clinical Document Integrity (CDI)
+    notes were there are verifications of clinical activities. The idea behind
+    it was to assess an LLM capability to answer these questions from previous notes.
+    """
+
+    name = "shc_cdi_med"
+    description = (
+        "A dataset built from Clinical Document Integrity (CDI) notes, to assess "
+        "the ability to answer verification questions from previous notes."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CDI-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_conf_scenario.py
@@ -0,0 +1,70 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCONFMedScenario(Scenario):
+    """
+    Benchmark derived from extracting confidential information from clinical notes.
+    From Evaluation of a Large Language Model to Identify Confidential Content in
+    Adolescent Encounter Notes published at https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109
+    """
+
+    name = "shc_conf_med"
+    description = (
+        "A dataset of clinical notes from adolescent patients used to identify sensitive "
+        "protected health information that should be restricted from parental access."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CONF-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_ent_scenario.py
@@ -0,0 +1,72 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCENTMedScenario(Scenario):
+    """
+    This benchmark dataset was built to assess the capabilities "
+    "of an LLM for referral to the Ear, Nose and Throat department.
+    """
+
+    name = "shc_ent_med"
+    description = (
+        "A dataset designed to evaluate performance in "
+        "identifying appropriate patient referrals to Ear, Nose, and Throat specialists."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                if row["label"] != "":  # skip rows with character/encoding issues - 79
+                    question = row["prompt"]
+                    context = row["context"]
+                    answer = row["label"]
+                    prompt = (
+                        f"{counter} Provide an answer to the following {question} with the following context:"
+                        f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
+                        " Do not provide any additional details or response, just a simple A, B, or C response."
+                    )
+                    data[prompt] = answer
+                    counter = counter + 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-ENT-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_gip_scenario.py
@@ -0,0 +1,66 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCGIPMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient referral gold-standard set
+    to a specialty clinic to verify the ability of LLMs for patient hospice referral purposes.
+    """
+
+    name = "shc_gip_med"
+    description = "A dataset evaluating performance in identifying appropriate patient referrals to hospice care."
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-GIP-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_ptbm_scenario.py
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPTBMMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters of
+    children ages 4-6 years old with ADHD seen at Stanford's community-based primary
+    care network, Packard Children's Health Alliance, between 2015-2019. In this classification
+    task, the LLM is tasked with classifying whether the note contains clinician recommendation
+    for parent training in behavior management, which is the first-line evidence-based treatment
+    for young children with ADHD. From publication: https://doi.org/10.1093/jamia/ocae001
+    """
+
+    name = "shc_ptbm_med"
+    description = (
+        "A dataset that classifies whether a clinical note contains a clinician "
+        "recommendation for parent training in behavior management, which is the first-line "
+        "evidence-based treatment for young children with ADHD."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children with "
+                    "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
+                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-PTBM-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import csv
|
|
3
|
+
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSEIMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters
+    (in-person/telehealth and telephone) of children ages 6-11 years old with ADHD
+    seen at Stanford's community-based primary care network, Packard Children's Health Alliance,
+    between 2015-2022. All children in this dataset were prescribed at least once an ADHD
+    medication (stimulants or non-stimulants) by a primary care clinician. In this
+    classification task, the LLM is tasked with classifying whether the note contains
+    documentation of side effect monitoring (recording of absence or presence of
+    medication side effects), as recommended in clinical practice guidelines.
+    From publication: https://doi.org/10.1542/peds.2024-067223
+    """
+
+    name = "shc_sei_med"
+    description = (
+        "A dataset that classifies whether a clinical note contains documentation "
+        "of side effect monitoring (recording of absence or presence of medication "
+        "side effects), as recommended in clinical practice guidelines."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children "
+                    "with attention deficit hyperactivity disorder (ADHD). Given the following "
+                    "definitions: side Effects Inquiry (SEI): Explicit documentation by the clinician "
+                    "asking about current side effects related to ADHD medications that the child is "
+                    "taking or documentation of specific ADHD medication side effects experienced "
+                    "by the patient. SEI does *not* include future side effects monitoring, "
+                    "such as documentation of potential ADHD medication side effects, including "
+                    "planning to follow patients to monitor side effects, explaining about "
+                    "potential side effects of an ADHD medication. These documentations are not "
+                    "categorized as SEI because they consist of a plan or an explanation about "
+                    "side effects without actual side effect monitoring taking place, and "
+                    "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-SEI-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
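For readers unfamiliar with HELM's multiple-choice scenarios, the reference construction above turns every answer choice into a Reference and tags only the one matching the gold label with CORRECT_TAG. A minimal standalone sketch of that pattern, using a hypothetical gold label rather than a CSV row, would be:

    from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

    # Hypothetical gold label for illustration; the scenario reads this from the CSV "label" column.
    gold = "A"
    choices = ["A", "B"]

    references = [
        Reference(Output(text=choice), tags=[CORRECT_TAG] if choice == gold else [])
        for choice in choices
    ]
    # references[0] carries CORRECT_TAG; references[1] has no tags.

Note also that create_benchmark keys its dict by the full prompt, so rows that render to identical prompts collapse into a single instance.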
@@ -0,0 +1,69 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSequoiaMedScenario(Scenario):
+    """
+    Benchmark derived from manually curated answers to several questions for Sequoia clinic referrals
+    """
+
+    name = "shc_sequoia_med"
+    description = (
+        "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)  # , quoting=csv.QUOTE_MINIMAL
+            for row in reader:
+                question = row["question"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+                counter += 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-sequoia-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
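Both SHC scenarios read a local CSV whose columns drive prompt construction: this file expects "question", "context", and "label" (the SEI file uses "prompt" in place of "question"), with labels restricted to "A" or "B" by the assert in get_instances. A tiny hypothetical fragment matching what this reader expects might look like:

    import csv
    import io

    # Hypothetical rows for illustration only; the real file lives at the DBFS path above.
    sample = io.StringIO(
        "question,context,label\n"
        "Was the referral appropriate?,<referral note text>,A\n"
    )
    for row in csv.DictReader(sample):
        assert row["label"] in ["A", "B"]  # mirrors the check performed in get_instances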
@@ -1,7 +1,7 @@
 from typing import List
 from datasets import load_dataset
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
 
 
 class SimpleSafetyTestsScenario(Scenario):
@@ -0,0 +1,91 @@
+import json
+import os
+from typing import Dict, List
+
+from filelock import FileLock
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+    generate_schema_prompt,
+)
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Scenario,
+    Instance,
+    Reference,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
+
+
+def _ensure_file_unzipped(source_path: str, target_path: str):
+    with FileLock(f"{target_path}.lock"):
+        if os.path.exists(target_path):
+            hlog(f"Not decompressing {source_path} because {target_path} already exists")
+            return
+        tmp_path = target_path + ".tmp"
+        ensure_directory_exists(tmp_path)
+        shell(["unzip", source_path, "-d", tmp_path])
+        shell(["mv", tmp_path, target_path])
+
+
+class SpiderScenario(Scenario):
+    """Spider 1.0"""
+
+    name = "spider"
+    description = "spider"
+    tags = ["sql"]
+
+    INSTRUCTIONS_PROMPT = """-- Using valid SQLite, answer the following questions for the tables provided above.
+"""
+    COT_PROMPT = """
+Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+<reasoning>
+INSERT_YOUR_REASONING_HERE
+</reasoning>
+<sql>
+INSERT_YOUR_SQL_QUERY_HERE
+</sql>"""  # noqa: E501
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            "https://drive.google.com/uc?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&export=download&confirm=t",
+            data_parent_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+        data_root_path = os.path.join(data_parent_path, "spider_data")
+        databases_root_path = os.path.join(data_root_path, "test_database")
+
+        database_schema_prompts: Dict[str, str] = {}
+        for database_name in os.listdir(databases_root_path):
+            database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+            if not os.path.exists(database_path):
+                # Ignore stray ".DS_Store" directory
+                continue
+
+            database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+            database_schema_prompts[database_name] = database_schema_prompt
+
+        instances: List[Instance] = []
+        dataset_path = os.path.join(data_root_path, "test.json")
+        dataset = json.load(open(dataset_path, "r"))
+        for row in dataset:
+            database_id: str = row["db_id"]
+            question: str = row["question"]
+            gold_sql: str = row["query"]
+
+            schema_prompt = database_schema_prompts[database_id]
+            combined_prompt = schema_prompt + "\n\n" + self.INSTRUCTIONS_PROMPT + question + self.COT_PROMPT
+            instance = Instance(
+                input=Input(text=combined_prompt),
+                references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                extra_data={"db_id": row["db_id"]},
+                split=VALID_SPLIT,
+            )
+            instances.append(instance)
+        return instances
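COT_PROMPT asks the model to wrap its answer in <reasoning> and <sql> tags, but the extraction step that consumes that format is not part of this hunk. A minimal sketch of pulling the query back out of such a response, assuming a simple regex over the tags (not necessarily how the package's metric does it), could be:

    import re

    def extract_sql(response: str) -> str:
        # Take the contents of the <sql>...</sql> block requested by COT_PROMPT,
        # falling back to the raw response if the tags are missing.
        match = re.search(r"<sql>(.*?)</sql>", response, re.DOTALL)
        return match.group(1).strip() if match else response.strip()

    print(extract_sql("<reasoning>join singers to concerts</reasoning>\n<sql>SELECT name FROM singer</sql>"))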