crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/shc_sei_scenario.py
@@ -0,0 +1,94 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSEIMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters
+    (in-person/telehealth and telephone) of children ages 6-11 years old with ADHD
+    seen at Stanford's community-based primary care network, Packard Children's Health Alliance,
+    between 2015-2022. All children in this dataset were prescribed at least once an ADHD
+    medication (stimulants or non-stimulants) by a primary care clinician. In this
+    classification task, the LLM is tasked with classifying whether the note contains
+    documentation of side effect monitoring (recording of absence or presence of
+    medication side effects), as recommended in clinical practice guidelines.
+    From publication: https://doi.org/10.1542/peds.2024-067223
+    """
+
+    name = "shc_sei_med"
+    description = (
+        "ADHD-MedEffects is a benchmark designed to evaluate whether clinical notes for"
+        "pediatric ADHD visits document medication side effect monitoring, which is a key recommendation"
+        "in clinical practice guidelines. The dataset supports binary classification"
+        "to detect presence or absence of side effect inquiries (SEI) within notes."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children "
+                    "with attention deficit hyperactivity disorder (ADHD). Given the following "
+                    "definitions: side Effects Inquiry (SEI): Explicit documentation by the clinician "
+                    "asking about current side effects related to ADHD medications that the child is "
+                    "taking or documentation of specific ADHD medication side effects experienced "
+                    "by the patient. SEI does *not* include future side effects monitoring, "
+                    "such as documentation of potential ADHD medication side effects, including "
+                    "planning to follow patients to monitor side effects, explaining about "
+                    "potential side effects of an ADHD medication. These documentations are not "
+                    "categorized as SEI because they consist of a plan or an explanation about "
+                    "side effects without actual side effect monitoring taking place, and "
+                    "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
+                    f"Provide an answer to the following question: {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[SHCSEIMedScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
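The reference layout above is the usual HELM multiple-choice pattern: every instance carries both answer options, and CORRECT_TAG marks the gold label. Below is a minimal illustrative sketch of exercising the new scenario on its own; the CSV path and output directory are placeholders, and the CSV must provide the prompt, context, and label columns that create_benchmark() reads.

# Illustrative sketch only, not part of the package. Paths are placeholders.
from helm.benchmark.scenarios.scenario import CORRECT_TAG
from helm.benchmark.scenarios.shc_sei_scenario import SHCSEIMedScenario

scenario = SHCSEIMedScenario(data_path="/path/to/shc_sei.csv")
instances = scenario.get_instances(output_path="/tmp/shc_sei_scenario")
for instance in instances[:3]:
    # Collect the reference(s) tagged as correct for a quick spot check.
    gold = [ref.output.text for ref in instance.references if CORRECT_TAG in ref.tags]
    print(instance.split, gold, instance.input.text[:80])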
helm/benchmark/scenarios/shc_sequoia_scenario.py
@@ -0,0 +1,77 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import check_file_exists
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSequoiaMedScenario(Scenario):
+    """
+    Benchmark derived from manually curated answers to several questions for Sequoia clinic referrals
+    """
+
+    name = "shc_sequoia_med"
+    description = (
+        "ClinicReferral is a benchmark that determines patient eligibility for referral to the"
+        "Sequoia Clinic based on information from palliative care notes. The dataset provides"
+        "curated decisions on referral appropriateness to assist in automating clinic workflows."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)  # , quoting=csv.QUOTE_MINIMAL
+            for row in reader:
+                question = row["question"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f" {counter} Provide an answer to the following question: {question} with the following context:"
+                    f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+                counter += 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[SHCSequoiaMedScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(self.data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/simple_safety_tests_scenario.py
@@ -1,7 +1,7 @@
 from typing import List
 from datasets import load_dataset
 
-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
 
 
 class SimpleSafetyTestsScenario(Scenario):
helm/benchmark/scenarios/spider_scenario.py
@@ -0,0 +1,91 @@
+import json
+import os
+from typing import Dict, List
+
+from filelock import FileLock
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.bird_sql_scenario_helper import (  # type: ignore
+    generate_schema_prompt,
+)
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Scenario,
+    Instance,
+    Reference,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
+
+
+def _ensure_file_unzipped(source_path: str, target_path: str):
+    with FileLock(f"{target_path}.lock"):
+        if os.path.exists(target_path):
+            hlog(f"Not decompressing {source_path} because {target_path} already exists")
+            return
+        tmp_path = target_path + ".tmp"
+        ensure_directory_exists(tmp_path)
+        shell(["unzip", source_path, "-d", tmp_path])
+        shell(["mv", tmp_path, target_path])
+
+
+class SpiderScenario(Scenario):
+    """Spider 1.0"""
+
+    name = "spider"
+    description = "spider"
+    tags = ["sql"]
+
+    INSTRUCTIONS_PROMPT = """-- Using valid SQLite, answer the following questions for the tables provided above.
+"""
+    COT_PROMPT = """
+Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:
+
+<reasoning>
+INSERT_YOUR_REASONING_HERE
+</reasoning>
+<sql>
+INSERT_YOUR_SQL_QUERY_HERE
+</sql>"""  # noqa: E501
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_parent_path = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            "https://drive.google.com/uc?id=1403EGqzIDoHMdQF4c9Bkyl7dZLZ5Wt6J&export=download&confirm=t",
+            data_parent_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+        data_root_path = os.path.join(data_parent_path, "spider_data")
+        databases_root_path = os.path.join(data_root_path, "test_database")
+
+        database_schema_prompts: Dict[str, str] = {}
+        for database_name in os.listdir(databases_root_path):
+            database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
+            if not os.path.exists(database_path):
+                # Ignore stray ".DS_Store" directory
+                continue
+
+            database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
+            database_schema_prompts[database_name] = database_schema_prompt
+
+        instances: List[Instance] = []
+        dataset_path = os.path.join(data_root_path, "test.json")
+        dataset = json.load(open(dataset_path, "r"))
+        for row in dataset:
+            database_id: str = row["db_id"]
+            question: str = row["question"]
+            gold_sql: str = row["query"]
+
+            schema_prompt = database_schema_prompts[database_id]
+            combined_prompt = schema_prompt + "\n\n" + self.INSTRUCTIONS_PROMPT + question + self.COT_PROMPT
+            instance = Instance(
+                input=Input(text=combined_prompt),
+                references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
+                extra_data={"db_id": row["db_id"]},
+                split=VALID_SPLIT,
+            )
+            instances.append(instance)
+        return instances
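COT_PROMPT asks the model to wrap its answer in <reasoning> and <sql> tags, so the generated SQL has to be pulled out of the tagged completion before it can be compared against the gold query or executed. A rough sketch of that extraction step is shown below; it is illustrative only and is not the implementation in helm/benchmark/metrics/spider_metrics.py.

# Illustrative sketch: extract the SQL from a completion that follows the
# tag-delimited format requested by COT_PROMPT above.
import re

def extract_sql(completion: str) -> str:
    """Return the last <sql>...</sql> block, or the whole completion if no tags are found."""
    matches = re.findall(r"<sql>(.*?)</sql>", completion, flags=re.DOTALL)
    return matches[-1].strip() if matches else completion.strip()

completion = "<reasoning>Count the rows in the singer table.</reasoning>\n<sql>SELECT COUNT(*) FROM singer</sql>"
print(extract_sql(completion))  # SELECT COUNT(*) FROM singer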
helm/benchmark/scenarios/starr_patient_instructions_scenario.py
@@ -0,0 +1,97 @@
+import csv
+from typing import List
+
+from helm.common.general import check_file_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+class StarrPatientInstructionsScenario(Scenario):
+    """
+    Starr Patient Instructions is a dataset created from STARR-OMOP data, containing after-visit instructions
+    for outpatient surgeries/procedures. Each example corresponds to one surgery or procedure case (only including
+    outpatient or observation/overnight cases with discharge within 24 hours) and includes the following fields:
+
+    - Diagnosis: Why the patient needs the surgery/procedure.
+    - ActualProcedure: The surgery/procedure name.
+    - HistoryPhysicalNoteText: The History & Physical note written by the surgeon.
+    - OperativeNoteText: The report describing what was done during the surgery/procedure.
+    - DischargeInstructionNoteText: The specific after-surgery care instructions given to the patient.
+
+    The task is to generate personalized post-procedure patient instructions based on the provided case details.
+
+    Sample Synthetic Prompt:
+        Given the following case details, generate personalized after-surgery care instructions.
+
+        Diagnosis: [diagnosis text]
+        Procedure: [actual procedure text]
+        History & Physical: [H&P note text]
+        Operative Report: [operative note text]
+
+        Patient Instructions:
+    """
+
+    name = "starr_patient_instructions"
+    description = (
+        "PatientInstruct is a benchmark designed to evaluate models on generating personalized"
+        "post-procedure instructions for patients. It includes real-world patient History & Physical"
+        "Note (H&P) and operative report, from which models must produce clear, actionable instructions"
+        "appropriate for patients recovering from medical interventions."
+    )
+    tags = ["patient_communication", "healthcare", "instruction_generation", "surgery"]
+
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[StarrPatientInstructiosScenario] Required data file not found: '{self.data_path}'"
+        )
+        instances: List[Instance] = []
+        # For now, we assign all instances to the test split (zero-shot setting).
+        split = TEST_SPLIT
+
+        with open(self.data_path, "r", encoding="utf-8") as csvfile:
+            reader = csv.DictReader(csvfile)
+            for row in reader:
+                # Retrieve and strip the relevant fields.
+                qc_value = row.get("QC", "").strip().upper()
+                if qc_value != "TRUE":
+                    continue
+                diagnosis = row.get("Diagnosis", "").strip()
+                actual_procedure = row.get("ActualProcedure", "").strip()
+                history_physical = row.get("HistoryPhysicalNoteText", "").strip()
+                operative_note = row.get("OperativeNoteText", "").strip()
+                discharge_instruction = row.get("DischargeInstructionNoteText", "").strip()
+
+                # Skip the instance if any required field is missing.
+                if not (
+                    diagnosis and actual_procedure and history_physical and operative_note and discharge_instruction
+                ):
+                    continue
+
+                # Construct the input prompt by concatenating the fields.
+                input_text = (
+                    f"Diagnosis: {diagnosis}\n"
+                    f"Procedure: {actual_procedure}\n"
+                    f"History & Physical: {history_physical}\n"
+                    f"Operative Report: {operative_note}\n\n"
+                )
+
+                instances.append(
+                    Instance(
+                        input=Input(text=input_text),
+                        references=[Reference(Output(text=discharge_instruction), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
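Unlike the classification scenarios above, this one is a free-form generation task: the prompt is assembled from four note fields, and the discharge instructions serve as the single gold reference. A small worked example of the prompt construction in the loop above follows; every field value is an invented placeholder, not real patient data.

# Illustrative sketch: the input_text produced for a toy CSV row (placeholder values only).
row = {
    "QC": "TRUE",
    "Diagnosis": "Right inguinal hernia",
    "ActualProcedure": "Laparoscopic inguinal hernia repair",
    "HistoryPhysicalNoteText": "Healthy adult with a reducible right inguinal hernia.",
    "OperativeNoteText": "Mesh repair performed laparoscopically without complication.",
    "DischargeInstructionNoteText": "Keep incisions clean and dry; no heavy lifting for two weeks.",
}
input_text = (
    f"Diagnosis: {row['Diagnosis']}\n"
    f"Procedure: {row['ActualProcedure']}\n"
    f"History & Physical: {row['HistoryPhysicalNoteText']}\n"
    f"Operative Report: {row['OperativeNoteText']}\n\n"
)
print(input_text)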
helm/benchmark/scenarios/summarization_scenario.py
@@ -3,7 +3,17 @@ import pickle
 
 from typing import List, Optional
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class SummarizationScenario(Scenario):
@@ -0,0 +1,157 @@
+import os
+import re
+from typing import Dict, List, Optional
+
+import pandas as pd
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+try:
+    # Needed for pandas.read_excel
+    import openpyxl  # noqa
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, ["ibm-enterprise-scenarios"])
+
+
+class SUMOSumScenario(Scenario):
+    """SUMO Web Claims Summarization
+
+    SUMO Web Claims Summarization is a summarization task over the climate subset from the SUMO dataset.
+    The task is to write a title based on the article contents.
+
+    Citation:
+    @inproceedings{mishra-etal-2020-generating,
+        title = "Generating Fact Checking Summaries for Web Claims",
+        author = "Mishra, Rahul and
+          Gupta, Dhruv and
+          Leippold, Markus",
+        editor = "Xu, Wei and
+          Ritter, Alan and
+          Baldwin, Tim and
+          Rahimi, Afshin",
+        booktitle = "Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.wnut-1.12",
+        doi = "10.18653/v1/2020.wnut-1.12",
+        pages = "81--90",
+        abstract = "We present SUMO, a neural attention-based approach that learns to establish correctness of textual claims based on evidence in the form of text documents (e.g., news articles or web documents). SUMO further generates an extractive summary by presenting a diversified set of sentences from the documents that explain its decision on the correctness of the textual claim. Prior approaches to address the problem of fact checking and evidence extraction have relied on simple concatenation of claim and document word embeddings as an input to claim driven attention weight computation. This is done so as to extract salient words and sentences from the documents that help establish the correctness of the claim. However this design of claim-driven attention fails to capture the contextual information in documents properly. We improve on the prior art by using improved claim and title guided hierarchical attention to model effective contextual cues. We show the efficacy of our approach on political, healthcare, and environmental datasets.",
+    }
+    """  # noqa: E501
+
+    name = "sumosum"
+    description = "Text summarization with climate corpus"
+    tags = ["summarization", "climate"]
+
+    TRAIN_RATIO = 0.2
+    TITLE_KEY = "Title"
+    DOCUMENT_KEY = "Doc_text"
+
+    def __init__(
+        self,
+        train_filter_min_length: Optional[int] = None,
+        train_filter_max_length: Optional[int] = None,
+        test_filter_min_length: Optional[int] = None,
+        test_filter_max_length: Optional[int] = None,
+        truncate_length: Optional[int] = None,
+    ):
+        """
+        Initializes the scenario.
+        Args:
+            train_filter_min_length: Int indicating minimum length for training
+                                     documents. Train examples smaller than
+                                     train_filter_min_length tokens will be filtered out.
+            train_filter_max_length: Int indicating maximum length for training
+                                     documents. Train examples larger than
+                                     train_filter_max_length tokens will be filtered out.
+            test_filter_min_length: Int indicating minimum length for training
+                                    documents. Test examples smaller than
+                                    test_filter_min_length tokens will be filtered out.
+            test_filter_max_length: Int indicating maximum length for training
+                                    documents. Test examples larger than
+                                    test_filter_max_length tokens will be filtered out.
+            truncate_length: Int indicating the maximum length in tokens to
+                             truncate documents. Documents in all splits will be
+                             truncated to truncate_length tokens.
+                             NOTE: Whitespace tokenization is used to compute tokens.
+        """
+        super().__init__()
+        self.train_filter_min_length = train_filter_min_length
+        self.train_filter_max_length = train_filter_max_length
+        self.test_filter_min_length = test_filter_min_length
+        self.test_filter_max_length = test_filter_max_length
+        self.truncate_length = truncate_length
+
+    @staticmethod
+    def _clean_and_truncate(text: str, max_length: Optional[int] = None) -> str:
+        text = re.sub(r"\s+", " ", text)
+        return " ".join(text.split()[:max_length])
+
+    def _load_dataset(self, output_path: str) -> Dict[str, pd.DataFrame]:
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        source_url = "https://github.com/rahulOmishra/SUMO/raw/main/climate_claims_raw.xlsx"
+        source_file = os.path.basename(source_url)
+        target_path = os.path.join(data_dir, source_file)
+        ensure_file_downloaded(
+            source_url=source_url,
+            target_path=target_path,
+        )
+
+        # Column headers: Claim_id(int),Claim,Title,Doc_text,Label(bool)
+        target_df = pd.read_excel(target_path, skiprows=1)
+        target_df = target_df.dropna(subset=[SUMOSumScenario.TITLE_KEY, SUMOSumScenario.DOCUMENT_KEY])
+        # Remove carriage return _x000D_ in Excel string
+        target_df = target_df.replace({r"_x000D_": ""}, regex=True)
+        # target_df = target_df.replace({r"_x([0-9a-fA-F]{4})_": ""}, regex=True)
+        # Split randomly (works better than split by order)
+        train_df = target_df.sample(frac=SUMOSumScenario.TRAIN_RATIO, random_state=0)
+        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset_dict = self._load_dataset(output_path)
+
+        instances: List[Instance] = []
+
+        for split, split_data in dataset_dict.items():
+            for example in split_data.itertuples():
+                document = getattr(example, SUMOSumScenario.DOCUMENT_KEY)
+                title = getattr(example, SUMOSumScenario.TITLE_KEY)
+                art_len = len(document.split())
+                if split == TEST_SPLIT:
+                    if self.test_filter_max_length and art_len > self.test_filter_max_length:
+                        continue
+                    if self.test_filter_min_length and art_len < self.test_filter_min_length:
+                        continue
+                if split == TRAIN_SPLIT:
+                    if self.train_filter_max_length and art_len > self.train_filter_max_length:
+                        continue
+                    if self.train_filter_min_length and art_len < self.train_filter_min_length:
+                        continue
+
+                document = SUMOSumScenario._clean_and_truncate(document, self.truncate_length)
+                title = SUMOSumScenario._clean_and_truncate(title)
+
+                instance = Instance(
+                    input=Input(text=document),
+                    references=[Reference(output=Output(text=title), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+                instances.append(instance)
+
+        return instances
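The train/test length filters and truncate_length above all count whitespace-delimited tokens, per the NOTE in the docstring. A small standalone sketch of that behavior follows; the helper mirrors _clean_and_truncate and the sample text is made up.

import re

def clean_and_truncate(text: str, max_length=None) -> str:
    # Collapse all whitespace runs, then keep at most max_length whitespace tokens.
    text = re.sub(r"\s+", " ", text)
    return " ".join(text.split()[:max_length])

doc = "Global   temperatures\nrose sharply over the last decade."
print(len(doc.split()))            # 8 tokens, the count the length filters compare against
print(clean_and_truncate(doc, 5))  # "Global temperatures rose sharply over"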
@@ -2,7 +2,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output
 
 NUM_INPUT_TOKENS: List[int] = [
     1,
@@ -67,7 +67,17 @@ from copy import copy
 from typing import List, Dict, Literal, Tuple
 from dataclasses import dataclass
 
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 @dataclass(frozen=True)
@@ -25,7 +25,17 @@ The model hence is asked to do the following three tasks:
 import numpy as np
 from typing import List, Dict, Tuple
 
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 ANIMALS = ["zebra", "cobra", "stork", "penguin", "shark", "lion", "buffalo", "whale", "seal", "eagle", "horse", "rat"]
 FRUITS = ["apple", "peach", "watermelon", "banana", "grape", "kiwi", "pear", "strawberry", "blueberry", "blackberry"]
@@ -0,0 +1,26 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bigcodebench_scenario import BigCodeBenchScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_bigcodebench_scenario_get_instances():
+    bigcodebench_scenario = BigCodeBenchScenario("v0.1.2")
+    with TemporaryDirectory() as tmpdir:
+        instances = bigcodebench_scenario.get_instances(tmpdir)
+    assert len(instances) == 1140
+    assert instances[0].id == "BigCodeBench/0"
+    assert instances[0].input == Input(
+        text=(
+            "Calculates the average of the sums of absolute differences between each pair "
+            "of consecutive numbers for all permutations of a given list. Each permutation "
+            "is shuffled before calculating the differences. Args: - numbers (list): A list "
+            "of numbers. Default is numbers from 1 to 10.\nThe function should output with:\n"
+            "    float: The average of the sums of absolute differences for each shuffled permutation "
+            "of the list.\nYou should write self-contained code starting with:\n```\nimport itertools\n"
+            "from random import shuffle\ndef task_func(numbers=list(range(1, 3))):\n```"
+        )
+    )
+    assert instances[0].split == TEST_SPLIT
@@ -0,0 +1,18 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.czech_bank_qa_scenario import CzechBankQAScenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input
+
+
+@pytest.mark.scenarios
+def test_czech_bank_qa_scenario_get_instances():
+    scenario = CzechBankQAScenario(config_name="default")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 30
+    assert actual_instances[0].input == Input(text="Get the total number of accounts in the system")
+    assert len(actual_instances[0].references) == 1
+    assert actual_instances[0].references[0].tags == [CORRECT_TAG]
+    assert actual_instances[0].references[0].output.text == "SELECT COUNT(*) FROM ACCOUNT"
+    assert actual_instances[0].split == "test"
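Both new tests are gated behind the scenarios pytest marker, so they run only when that marker is selected. One way to select them programmatically; the invocation below is an assumption, not part of the diff.

import pytest

# Run only tests carrying the "scenarios" marker and propagate pytest's exit code.
raise SystemExit(pytest.main(["-m", "scenarios"]))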