crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
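The hunks below reproduce seven of the newly added Stanford Health Care (SHC) scenario files from the listing above, in listing order from shc_bmt_scenario.py through shc_proxy_scenario.py.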
helm/benchmark/scenarios/shc_bmt_scenario.py
@@ -0,0 +1,75 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCBMTMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient status gold standard
+    for specific questions asked after a bone marrow transplant has taken place.
+    """
+
+    name = "shc_bmt_med"
+    description = (
+        "BMT-Status is a benchmark composed of clinical notes and associated binary questions "
+        "related to bone marrow transplant (BMT), hematopoietic stem cell transplant (HSCT), "
+        "or hematopoietic cell transplant (HCT) status. The goal is to determine whether the "
+        "patient received a subsequent transplant based on the provided clinical documentation."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
+                    ", Answer the question with an 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCBMTMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
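All seven SHC scenarios follow the pattern in shc_bmt_scenario.py above: read a CSV with prompt, context, and label columns, build one instruction-style prompt per row, and emit test-split instances whose references mark the gold answer choice. A minimal sketch of how such a scenario is consumed, assuming a hypothetical local CSV with those three columns (the paths are placeholders; the classes and fields are the ones used in the hunk above):

from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.benchmark.scenarios.shc_bmt_scenario import SHCBMTMedScenario

# Hypothetical CSV with "prompt", "context", and "label" columns,
# as read by create_benchmark() above.
scenario = SHCBMTMedScenario(data_path="/data/bmt_status.csv")
instances = scenario.get_instances(output_path="/tmp/shc_bmt")  # output_path is unused by this scenario

for instance in instances[:1]:
    print(instance.input.text)  # the constructed yes/no question
    gold = [ref.output.text for ref in instance.references if CORRECT_TAG in ref.tags]
    print(gold)  # e.g. ["A"]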
helm/benchmark/scenarios/shc_cdi_scenario.py
@@ -0,0 +1,75 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCDIMedScenario(Scenario):
+    """
+    This benchmark dataset was built from Clinical Documentation Integrity (CDI)
+    notes where there are verifications of clinical activities. The idea behind
+    it was to assess an LLM's capability to answer these questions from previous notes.
+    """
+
+    name = "shc_cdi_med"
+    description = (
+        "CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) "
+        "notes. It is used to evaluate a model's ability to verify clinical conditions based on "
+        "documented evidence in patient records."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following question: {question} with the following context: {context} , "
+                    "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCCDIMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_conf_scenario.py
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCONFMedScenario(Scenario):
+    """
+    Benchmark derived from extracting confidential information from clinical notes.
+    From "Evaluation of a Large Language Model to Identify Confidential Content in
+    Adolescent Encounter Notes", published at https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109
+    """
+
+    name = "shc_conf_med"
+    description = (
+        "MedConfInfo is a benchmark comprising clinical notes from adolescent patients. It is "
+        "used to evaluate whether the content contains sensitive protected health information "
+        "(PHI) that should be restricted from parental access, in accordance with adolescent "
+        "confidentiality policies in clinical care."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
+                    ", Answer the question with an 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCCONFMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/shc_ent_scenario.py
@@ -0,0 +1,77 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCENTMedScenario(Scenario):
+    """
+    This benchmark dataset was built to assess the capabilities
+    of an LLM for referral to the Ear, Nose and Throat department.
+    """
+
+    name = "shc_ent_med"
+    description = (
+        "ENT-Referral is a benchmark designed to evaluate whether a patient's clinical note "
+        "supports a referral to an Ear, Nose, and Throat (ENT) specialist. It helps assess "
+        "models' abilities to make referral decisions based on unstructured clinical text."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                if row["label"] != "":  # skip rows with character/encoding issues (79 rows)
+                    question = row["prompt"]
+                    context = row["context"]
+                    answer = row["label"]
+                    prompt = (
+                        f"{counter} Provide an answer to the following question: {question} with the following context:"
+                        f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
+                        " Do not provide any additional details or response, just a simple A, B, or C response."
+                    )
+                    data[prompt] = answer
+                    counter = counter + 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCENTMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
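Unlike the other SHC scenarios, SHCENTMedScenario adds a third 'C' (no mention) choice, filters out rows with an empty label, and prefixes each prompt with a running counter. Because create_benchmark() keys its dict by prompt text, the counter presumably keeps otherwise-identical prompts from collapsing into one entry, as this sketch illustrates:

# Dict keys must be unique: duplicate prompt strings silently
# overwrite earlier rows unless each prompt carries a distinguishing prefix.
data = {}
data["Is a referral supported?"] = "A"
data["Is a referral supported?"] = "B"  # replaces the first row
assert len(data) == 1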
helm/benchmark/scenarios/shc_gip_scenario.py
@@ -0,0 +1,74 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCGIPMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a gold-standard set of patient referrals
+    to a specialty clinic, to verify the ability of LLMs to support hospice referral decisions.
+    """
+
+    name = "shc_gip_med"
+    description = (
+        "HospiceReferral is a benchmark that evaluates model performance in identifying "
+        "whether patients are eligible for hospice care based on palliative care clinical notes. "
+        "The benchmark focuses on end-of-life care referral decisions."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
+                    ", Answer the question with an 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCGIPMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
@@ -0,0 +1,78 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPRIVACYMedScenario(Scenario):
+    """
+    This dataset features messages generated by an LLM from patient clinical notes data.
+    The scenario evaluates the ability of an LLM to determine if any potentially confidential
+    information about the patient was included. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_privacy_med"
+    description = (
+        "PrivacyDetection is a benchmark composed of patient portal messages submitted by "
+        "patients or caregivers. The task is to determine whether the message contains any "
+        "confidential or privacy-leaking information that should be protected."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path: str) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if any confidential "
+                    f"information was included. Please answer the following question: {question} with the "
+                    f"following context: {context}. Answer the question with an 'A' for yes or a 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCPRIVACYMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPRIVACYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
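A quick way to sanity-check the CSV schema this reader expects (columns prompt, context, label) is to point create_benchmark at a throwaway file. The module path in the import below is assumed from the package layout, and the row values are made up:

    import csv
    import tempfile

    # Assumed module path for the file shown above.
    from helm.benchmark.scenarios.shc_privacy_med_scenario import SHCPRIVACYMedScenario

    with tempfile.NamedTemporaryFile("w", suffix=".csv", newline="", delete=False) as f:
        writer = csv.DictWriter(f, fieldnames=["prompt", "context", "label"])
        writer.writeheader()
        # Hypothetical row; real data comes from the configured data_path.
        writer.writerow(
            {
                "prompt": "Does the message reveal confidential information?",
                "context": "Patient asks to reschedule a follow-up visit.",
                "label": "B",
            }
        )
        path = f.name

    scenario = SHCPRIVACYMedScenario(data_path=path)
    data = scenario.create_benchmark(path)
    assert list(data.values()) == ["B"]  # one prompt mapped to its gold label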
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPROXYMedScenario(Scenario):
+    """
+    This dataset features messages sent by proxy users and non-proxy users, for evaluation of
+    LLM capabilities to determine the sender. From publication: https://doi.org/10.1001/jamapediatrics.2024.4438
+    """
+
+    name = "shc_proxy_med"
+    description = (
+        "ProxySender is a benchmark composed of patient portal messages received by clinicians. "
+        "It evaluates whether the message was sent by the patient or by a proxy user (e.g., parent, "
+        "spouse), which is critical for understanding who is communicating with healthcare "
+        "providers."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path: str) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing clinical messages in order to determine if they have been "
+                    f"sent by a proxy user. Please determine the following: {question} with the "
+                    f"following context: {context}. Answer the question with an 'A' for yes or a 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPROXYMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPROXYMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
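One behavioral detail worth noting, since create_benchmark is identical across these scenarios: the benchmark dict is keyed by the rendered prompt string, so any two CSV rows that produce the same prompt collapse into a single instance. A self-contained illustration of the effect:

    # data[prompt] = answer keeps only the last label for a repeated prompt.
    data = {}
    for prompt, answer in [("Q?", "A"), ("Q?", "B")]:
        data[prompt] = answer
    assert data == {"Q?": "B"}  # the first row's label was silently overwritten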
@@ -0,0 +1,81 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPTBMMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters of
+    children ages 4-6 years old with ADHD seen at Stanford's community-based primary
+    care network, Packard Children's Health Alliance, between 2015 and 2019. In this classification
+    task, the LLM is tasked with classifying whether the note contains a clinician recommendation
+    for parent training in behavior management, which is the first-line evidence-based treatment
+    for young children with ADHD. From publication: https://doi.org/10.1093/jamia/ocae001
+    """
+
+    name = "shc_ptbm_med"
+    description = (
+        "ADHD-Behavior is a benchmark that evaluates a model’s ability to detect whether "
+        "a clinician recommends parent training in behavior management, an evidence-based "
+        "first-line treatment for young children diagnosed with ADHD. Each instance includes "
+        "a clinical note from a pediatric visit and a binary classification task."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path: str) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children with "
+                    "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
+                    f"behavioral therapy. Provide an answer to the following question: {question} with the "
+                    f"following context: {context}. Answer the question with an 'A' for yes or a 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCPTBMMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
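All four scenario files raise csv's per-field size limit at import time. The default cap is 131072 characters per field, which long clinical notes can exceed; the sketch below reproduces the failure and the override (on Windows, sys.maxsize does not fit in a C long, so a smaller cap would be needed):

    import csv
    import io
    import sys

    big_field = "x" * (csv.field_size_limit() + 1)  # one char past the default cap
    try:
        next(csv.reader(io.StringIO(big_field)))
    except csv.Error as err:
        print("default limit hit:", err)  # field larger than field limit (131072)

    csv.field_size_limit(sys.maxsize)  # same override the scenarios apply
    row = next(csv.reader(io.StringIO(big_field)))
    assert len(row[0]) == len(big_field)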