crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/scenarios/cti_to_mitre_scenario.py

@@ -0,0 +1,240 @@
+import os
+import json
+from random import Random
+from typing import Any, List, Dict
+
+import pandas as pd
+from pandas import DataFrame
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class CtiToMitreScenario(Scenario):
+    """
+    Original Task:
+    - The original task is to classify the description of the situation regarding the system
+      into the security threats in that situation.
+    - The classification categories are the approximately 200 categories of attack techniques
+      in the enterprise as defined by MITRE ATT&CK v10.1.
+
+    Implemented Task:
+    - Since classification into so many classes is difficult to handle in a generative language model
+      such as GPT itself, we implement this task as a multiple-choice task.
+    - Each choice is the name of the attack technique category into which the description is classified.
+    - The number of options is determined by the parameter (num_options).
+    - The minimum number of options is 2 and the maximum is 199, the number of all categories of
+      attack methods defined in MITRE ATT&CK v10.1.
+    - From the 199 choices, num_options choices, including the correct answer and a default case,
+      are randomly selected and used.
+    - If num_options is not specified, all 199 category names will be used as choices.
+
+    Data:
+    - dataset.csv
+      - Target dataset
+      - https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/a8cacf3185d098c686e0d88768a619a03a4d76d1/data/dataset.csv
+      - This data is of the form [sentence, label_tec, label_subtec, tec_name]
+        - sentence: the description
+        - label_tec: label for attack technique category
+        - label_subtec: label for attack technique subcategory
+        - tec_name : name(simple description) for attack technique subcategory
+      - Note: we need to extract name for attack technique category
+        from enterprise-attack.json
+
+    - enterprise-attack.json
+      - https://github.com/mitre/cti/archive/refs/tags/ATT&CK-v10.1.zip
+      - /mitre_v10/enterprise-attack/enterprise-attack.json
+      - This data contains relation from attack technique name to attack technique label
+      - we can extract attack technique category name for label_tec using this json data.
+
+
+    Prompt: (k is specified by num_options)
+    -----------------------
+    Answer the possible security attacks in each of the following situations from each of the options below.
+    [instruction]
+
+    Situation: <description> [in context examples]
+    A. <attack_category_name_1>
+    B. <attack_category_name_2>
+    ...
+    Y. <attack_category_name_(k-1)>
+    Z. Others
+    Answer: <correct_answer>
+
+    ... (Examples are output as long as the length allows) ...
+
+    Situation: <target_description> [target question]
+    A. <attack_category_name_t1>
+    B. <attack_category_name_t2>
+    ...
+    Y. <attack_category_name_t(k-1)>
+    Z. Others
+    Answer:
+    -----------------------
+
+    Example of prompt (num_options = 5)
+    -----------------------
+    Answer the possible security attacks in each of the following situations from each of the options below.
+
+    Situation: ZxShell can launch a reverse command shell.
+    A. Command and Scripting Interpreter
+    B. System Shutdown/Reboot
+    C. Exfiltration Over C2 Channel
+    D. Direct Volume Access
+    E. Others
+    Answer: A
+
+    ....(Omitted)...
+
+    Situation: APC injection is a method of executing arbitrary code in the address space.
+    A. Event Triggered Execution
+    B. Process Injection
+    C. Non-Application Layer Protocol
+    D. Escape to Host
+    E. Others
+    Answer: B
+
+    Situation: Timestomping may be used along with file name Masquerading to hide malware and tools.
+    A. Search Victim-Owned Websites
+    B. Internal Spearphishing
+    C. Application Layer Protocol
+    D. Indicator Removal on Host
+    E. Others
+    Answer:
+    -----------------------
+    """  # noqa: E501
+
+    # Names of the tasks we support
+    name = "cti_to_mitre"
+    description = "Classification of security attack opportunities on system"
+    tags = ["classification", "cyber_security"]
+
+    # Constant for splitting target data into train and test data.
+    train_ratio = 0.7
+
+    # Constant for default number of options. # of (MITRE ATT&CK attack categories) is 199 in ATT&CK-v10.1.zip
+    MAX_NUM_OPTIONS = 199
+
+    # Constant: the description for Others option
+    OTHERS_OPTION = "Others"
+
+    CTI_URL = "https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/a8cacf3185d098c686e0d88768a619a03a4d76d1/data/dataset.csv"  # noqa: E501
+    MITRE_URL = "https://github.com/mitre/cti/raw/refs/tags/ATT&CK-v10.1/enterprise-attack/enterprise-attack.json"
+
+    def __init__(self, num_options: int = MAX_NUM_OPTIONS, seed: int = 42) -> None:
+        """
+        num_options: int, number of choices in multiple-choice task
+        seed: int, seed for random module. The seed is set to random if specified
+        """
+        super().__init__()
+        self.num_options = min(num_options, CtiToMitreScenario.MAX_NUM_OPTIONS)
+        self.random_seed = seed
+        self.random = Random(seed)
+
+    @staticmethod
+    def make_label_category_name_dict(jdata: Dict[str, Any]) -> Dict[str, str]:
+        """
+        This makes mapping from label_tec (attack technique category label) to tec_category_name
+        (attack technique category name)
+        - jdata is json object for enterprise_attack.json
+        """
+
+        category_id_to_name: Dict[str, str] = {}
+        attacks = [
+            o for o in jdata["objects"] if o["type"] == "attack-pattern" and not o.get("x_mitre_is_subtechnique", True)
+        ]
+        for attack in attacks:
+            ids = [ref["external_id"] for ref in attack["external_references"] if ref["source_name"] == "mitre-attack"]
+            assert len(ids) == 1
+            id = ids[0]
+            category_id_to_name[id] = attack["name"]
+        return category_id_to_name
+
+    def get_references(self, num_references: int, correct_cname: str, cnames: List[str]) -> List[Reference]:
+        """
+        Randomly select k tec_category_names (attack technique category names) as choices.
+        However, choose not to include "excluded",
+        and if k is less than the total number of possible choices, add a default case.
+        - k : number of choices
+        - correct_cname : correct attack technique category names
+        - cnames : list containing all attack technique category names
+        """
+        assert num_references >= 2, "Need at least 2 references for the correct choice and 'Others'"
+        num_incorrect_cname_samples = num_references - 2
+        assert num_references <= len(
+            cnames
+        ), f"Cannot have more references than the number of categories, which is {len(cnames)}"
+        incorrect_cnames = [cname for cname in cnames if cname != correct_cname]
+        incorrect_cname_samples = self.random.sample(
+            incorrect_cnames, min(len(incorrect_cnames), num_incorrect_cname_samples)
+        )
+        references = [Reference(Output(text=cname), tags=[]) for cname in incorrect_cname_samples]
+        references.append(Reference(Output(text=correct_cname), tags=[CORRECT_TAG]))
+        self.random.shuffle(references)
+        if num_references <= len(cnames):
+            references.append(Reference(Output(text=CtiToMitreScenario.OTHERS_OPTION), tags=[]))
+        return references
+
+    def create_multiple_choice_instances(
+        self, df: DataFrame, split: str, label_cname: Dict[str, str]
+    ) -> List[Instance]:
+        """Create a list of instances corresponding to the multiple choice task"""
+        instances = []
+        for idx in df.index:
+            linedata = df.loc[idx]
+            sentence = linedata["sentence"]
+            label_tec = linedata["label_tec"]
+            correct_cname = label_cname[label_tec]
+            all_cnames = [cname for cname in label_cname.values()]
+            references = self.get_references(self.num_options, correct_cname, all_cnames)
+            input = Input(text=sentence)
+            instance = Instance(input, references, split=split)
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        dataset_path = os.path.join(data_dir, "dataset.csv")
+        ensure_file_downloaded(
+            source_url="https://github.com/dessertlab/cti-to-mitre-with-nlp/raw/a8cacf3185d098c686e0d88768a619a03a4d76d1/data/dataset.csv",  # noqa: E501
+            target_path=dataset_path,
+        )
+
+        labels_path = os.path.join(data_dir, "enterprise-attack.json")
+        ensure_file_downloaded(
+            source_url="https://github.com/mitre/cti/raw/refs/tags/ATT&CK-v10.1/enterprise-attack/enterprise-attack.json",  # noqa: E501
+            target_path=labels_path,
+        )
+
+        # load dataset
+        all_df = pd.read_csv(dataset_path)
+
+        # split all_df into train and test data frames
+        train_df = all_df.sample(frac=CtiToMitreScenario.train_ratio, random_state=self.random_seed)
+        test_df = all_df.drop(train_df.index).sample(frac=1, random_state=self.random_seed)
+
+        # load labels
+        with open(labels_path) as f:
+            jdata = json.load(f)
+
+        # make mapping from label_tec to tec_category_name
+        label_cname = self.make_label_category_name_dict(jdata)
+
+        # create instances from each dataset
+        instances_train = self.create_multiple_choice_instances(train_df, TRAIN_SPLIT, label_cname)
+        instances_test = self.create_multiple_choice_instances(test_df, TEST_SPLIT, label_cname)
+
+        # return all instances
+        all_instances = instances_train + instances_test
+        return all_instances
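
A minimal sketch (not part of the diff) of how the new scenario's option sampling behaves, assuming the wheel above is installed; the category names below are illustrative placeholders, whereas the real list is built from enterprise-attack.json at runtime:

```python
from helm.benchmark.scenarios.cti_to_mitre_scenario import CtiToMitreScenario

# Placeholder category names for illustration only.
category_names = [
    "Process Injection",
    "Phishing",
    "Valid Accounts",
    "Masquerading",
    "Exfiltration Over C2 Channel",
]

scenario = CtiToMitreScenario(num_options=5, seed=42)
# 5 options total: 3 sampled distractors plus the correct answer (shuffled together),
# followed by the trailing "Others" default case appended by get_references.
references = scenario.get_references(5, "Process Injection", category_names)
for reference in references:
    print(reference.output.text, reference.tags)
```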
helm/benchmark/scenarios/czech_bank_qa_scenario.py

@@ -0,0 +1,130 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class CzechBankQAScenario(Scenario):
+    INSTRUCTIONS = """Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.
+
+Database schema:
+CREATE TABLE "account" (
+"account_id" integer NOT NULL DEFAULT '0'
+, "district_id" integer NOT NULL DEFAULT '0'
+, "frequency" varchar(18) NOT NULL
+, "date" date NOT NULL
+, PRIMARY KEY ("account_id")
+, CONSTRAINT "account_ibfk_1" FOREIGN KEY ("district_id") REFERENCES "district" ("district_id")
+);
+CREATE TABLE "card" (
+"card_id" integer NOT NULL DEFAULT '0'
+, "disp_id" integer NOT NULL
+, "type" varchar(7) NOT NULL
+, "issued" date NOT NULL
+, PRIMARY KEY ("card_id")
+, CONSTRAINT "card_ibfk_1" FOREIGN KEY ("disp_id") REFERENCES "disp" ("disp_id")
+);
+CREATE TABLE "client" (
+"client_id" integer NOT NULL
+, "gender" varchar(1) NOT NULL
+, "birth_date" date NOT NULL
+, "district_id" integer NOT NULL
+, PRIMARY KEY ("client_id")
+, CONSTRAINT "client_ibfk_1" FOREIGN KEY ("district_id") REFERENCES "district" ("district_id")
+);
+CREATE TABLE "disp" (
+"disp_id" integer NOT NULL
+, "client_id" integer NOT NULL
+, "account_id" integer NOT NULL
+, "type" varchar(9) NOT NULL
+, PRIMARY KEY ("disp_id")
+, CONSTRAINT "disp_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
+, CONSTRAINT "disp_ibfk_2" FOREIGN KEY ("client_id") REFERENCES "client" ("client_id")
+);
+CREATE TABLE "district" (
+"district_id" integer NOT NULL DEFAULT '0'
+, "A2" varchar(19) NOT NULL
+, "A3" varchar(15) NOT NULL
+, "A4" integer NOT NULL
+, "A5" integer NOT NULL
+, "A6" integer NOT NULL
+, "A7" integer NOT NULL
+, "A8" integer NOT NULL
+, "A9" integer NOT NULL
+, "A10" decimal(4,1) NOT NULL
+, "A11" integer NOT NULL
+, "A12" decimal(4,1) DEFAULT NULL
+, "A13" decimal(3,2) NOT NULL
+, "A14" integer NOT NULL
+, "A15" integer DEFAULT NULL
+, "A16" integer NOT NULL
+, PRIMARY KEY ("district_id")
+);
+CREATE TABLE "loan" (
+"loan_id" integer NOT NULL DEFAULT '0'
+, "account_id" integer NOT NULL
+, "date" date NOT NULL
+, "amount" integer NOT NULL
+, "duration" integer NOT NULL
+, "payments" decimal(6,2) NOT NULL
+, "status" varchar(1) NOT NULL
+, PRIMARY KEY ("loan_id")
+, CONSTRAINT "loan_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
+);
+CREATE TABLE "order" (
+"order_id" integer NOT NULL DEFAULT '0'
+, "account_id" integer NOT NULL
+, "bank_to" varchar(2) NOT NULL
+, "account_to" integer NOT NULL
+, "amount" decimal(6,1) NOT NULL
+, "k_symbol" varchar(8) NOT NULL
+, PRIMARY KEY ("order_id")
+, CONSTRAINT "order_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
+);
+CREATE TABLE "trans" (
+"trans_id" integer NOT NULL DEFAULT '0'
+, "account_id" integer NOT NULL DEFAULT '0'
+, "date" date NOT NULL
+, "type" varchar(6) NOT NULL
+, "operation" varchar(14) DEFAULT NULL
+, "amount" integer NOT NULL
+, "balance" integer NOT NULL
+, "k_symbol" varchar(11) DEFAULT NULL
+, "bank" varchar(2) DEFAULT NULL
+, "account" integer DEFAULT NULL
+, PRIMARY KEY ("trans_id")
+, CONSTRAINT "trans_ibfk_1" FOREIGN KEY ("account_id") REFERENCES "account" ("account_id")
+);"""  # noqa: E501
+
+    """CzechBankQA"""
+    name = "czech_bank_qa"
+    description = "This is a list of SQL queries for a text-to-SQL task over the Czech Bank 1999 dataset."
+    tags = ["text_to_sql"]
+
+    def __init__(self, config_name: str):
+        super().__init__()
+        self.config_name = config_name
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "yifanmai/czech_bank_qa", name=self.config_name, split="test", cache_dir=cache_dir
+        )
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=row["description"])
+            references = [Reference(output=Output(text=row["sql_query"]), tags=[CORRECT_TAG])]
+            instance = Instance(input=input, references=references, split=TEST_SPLIT)
+            instances.append(instance)
+        return instances
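
A small sketch of what the scenario consumes, assuming access to the Hugging Face dataset referenced in get_instances; the config name below is a placeholder, since the real value is supplied by the run spec via self.config_name:

```python
import datasets

CONFIG_NAME = "default"  # placeholder; get_instances passes self.config_name from the run spec

dataset = datasets.load_dataset("yifanmai/czech_bank_qa", name=CONFIG_NAME, split="test")
row = dataset[0]
print(row["description"])  # natural-language request -> becomes Instance.input
print(row["sql_query"])    # gold SQLite query -> becomes the CORRECT_TAG reference
```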
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py

@@ -3,7 +3,7 @@ import json
 from typing import List
 from helm.common.general import ensure_file_downloaded
 from helm.common.general import ensure_directory_exists
-from .scenario import Scenario, Instance, Input, Reference, CORRECT_TAG, Output, VALID_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, CORRECT_TAG, Output, VALID_SPLIT
 
 TASKS = {
     "counterfactual": ["snli_premise", "snli_hypothesis"]
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py

@@ -5,7 +5,7 @@ import random
 from tqdm import tqdm
 from typing import List, Dict, Optional
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
 
 option_keys = ["pii", "privacy_understanding", "enron_email_extraction"]
 
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py

@@ -1,7 +1,7 @@
 import json
 import os
 from typing import List, Dict
-from .scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import Reference, Output
 
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py

@@ -4,7 +4,7 @@ import random
 from typing import List, Dict
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 
 DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
helm/benchmark/scenarios/dialogue_scenarios.py

@@ -6,7 +6,18 @@ from typing import Dict, List, Optional
 
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from .
+from helm.benchmark.runner import get_benchmark_output_path
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 @dataclass(frozen=True)
@@ -138,5 +149,5 @@ class EmpatheticDialoguesScenario(Scenario):
 
 if __name__ == "__main__":
     scenario = EmpatheticDialoguesScenario()
-    instances = scenario.get_instances("
+    instances = scenario.get_instances(os.path.join(get_benchmark_output_path(), "scenarios/empatheticdialogues"))
     print(instances[100])
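
A rough illustration (not from the diff): the rewritten `__main__` block now derives its scratch directory from the benchmark output path instead of a hard-coded string. Assuming HELM's default output location, the path resolves along these lines:

```python
import os
from helm.benchmark.runner import get_benchmark_output_path

# Typically "benchmark_output" under the default configuration, so the scenario
# data would land in benchmark_output/scenarios/empatheticdialogues.
print(os.path.join(get_benchmark_output_path(), "scenarios/empatheticdialogues"))
```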
@@ -0,0 +1,157 @@
+from typing import List
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+import pandas as pd
+
+
+def file_preprocessing(data_path: str, task_objective: str) -> pd.DataFrame:
+    """
+    Preprocess the data files to create a DataFrame with the necessary columns.
+    task_objective: 'brief_hospital_course' or 'discharge_instructions'
+    Use this command to download: wget -r -N -c -np --user {PHYSIONET_USERNAME} \
+        --ask-password https://physionet.org/files/discharge-me/1.3/
+    data_path is the directory that contains the downloaded files: '{base_dir}/physionet.org/'
+    """
+    # Load the first CSV file
+    df_diagnosis = pd.read_csv(
+        f"{data_path}/files/discharge-me/1.3/test_phase_1/diagnosis.csv.gz", compression="gzip", keep_default_na=False
+    )
+    df_discharge = pd.read_csv(
+        f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge.csv.gz", compression="gzip", keep_default_na=False
+    )
+    df_target = pd.read_csv(
+        f"{data_path}/files/discharge-me/1.3/test_phase_1/discharge_target.csv.gz",
+        compression="gzip",
+        keep_default_na=False,
+    )
+    df_radiology = pd.read_csv(
+        f"{data_path}/files/discharge-me/1.3/test_phase_1/radiology.csv.gz", compression="gzip", keep_default_na=False
+    )
+    df_ed = pd.read_csv(
+        f"{data_path}/files/discharge-me/1.3/test_phase_1/edstays.csv.gz", compression="gzip", keep_default_na=False
+    )
+    df_triage = pd.read_csv(
+        f"{data_path}/files/discharge-me/1.3/test_phase_1/triage.csv.gz", compression="gzip", keep_default_na=False
+    )
+    df_diagnosis_triage = pd.merge(
+        df_diagnosis, df_triage, on="subject_id", how="inner", suffixes=("_df_diagnosis", "_df_triage")
+    )
+    df_diagnosis_triage_discharge = pd.merge(
+        df_diagnosis_triage, df_discharge, on="subject_id", how="inner", suffixes=("", "_df_discharge")
+    )
+    df_diagnosis_triage_discharge_radiology = pd.merge(
+        df_diagnosis_triage_discharge, df_radiology, on="hadm_id", how="inner", suffixes=("", "_df_radiology")
+    )
+
+    df_features = pd.merge(
+        df_diagnosis_triage_discharge_radiology, df_ed, on="hadm_id", how="inner", suffixes=("", "_df_ed")
+    )
+
+    # Reduce the DataFrame to remove duplicate hadm_id
+    df_features_reduced = df_features.drop_duplicates(subset="hadm_id")
+    columns_to_keep = ["text", "text_df_radiology", "hadm_id"]
+    df_input = df_features_reduced[columns_to_keep]
+    final_df = pd.merge(df_input, df_target, on="hadm_id", how="inner")
+
+    def remove_substring(string, substring):
+        return string.replace(substring, "")
+
+    final_df["text"] = final_df.apply(lambda row: remove_substring(row["text"], row[task_objective]), axis=1)
+    return final_df
+
+
+def create_prompt(text: str, text_df_radiology: str, task_objective: str) -> str:
+    """
+    Create the prompt for the instance.
+    """
+    prompt = f"Generate the {task_objective} from the following patient discharge text and radiology report text.\
+\n\nDischarge Text:\n{text}\n\nRadiology Report:\n{text_df_radiology}\n\n{task_objective}:\n"
+    return prompt
+
+
+class DischargeMeScenario(Scenario):
+    """
+    DischargeMe is a discharge instruction generation dataset and brief hospital course generation \
+    dataset collected from MIMIC-IV data.
+    In this scenario, we only consider the discharge text as well as the radiology report text.
+    We are using the phase I test set, which is composed of 14,702 hospital admission instances.
+
+    The splits are provided by the dataset itself.
+
+    TASKS = {discharge instruction, brief hospital course}
+    Sample Synthetic Prompt:
+        Generate the {TASK} from the following patient discharge text and radiology report text.
+
+        Discharge Text:
+        Name: {Patient Name} Unit No: {Unit Number} Date of Birth: {DOB} Date of Admission:
+        {DOA} Date of Discharge: {DOD}
+        Chief Complaint: {Chief Complaint} History of Present Illness: {HPI} Past Medical History: {PMH}
+        Medications on Admission: {Medications} Allergies: {Allergies} Physical Exam: {Physical Exam}
+        Discharge Diagnosis: {Discharge Diagnosis}
+
+        Radiology Report:
+        {Radiology Report}
+
+        {TASK}:
+    @inproceedings{Xu_2024,
+        title={Discharge Me: BioNLP ACL'24 shared task on streamlining discharge documentation},
+        url={https://doi.org/10.13026/4a0k-4360},
+        DOI={10.13026/27pt-1259},
+        booktitle={Proceedings of the 23rd Workshop on Biomedical Natural Language Processing (BioNLP) at ACL 2024},
+        publisher={Association for Computational Linguistics},
+        author={Xu, Justin and Delbrouck, Jean-Benoit and Johnston, Andrew and Blankemeier, Louis and Langlotz, Curtis},
+        year={2024}
+    }
+    """
+
+    name = "dischargeme"
+    description = "DischargeMe is a discharge instruction generation dataset and brief hospital course generation \
+dataset collected from MIMIC-IV data, considering only the discharge text as well as the radiology report text."
+    tags = ["biomedical"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/share/pi/nigam/data/physionet.org"
+        ensure_directory_exists(data_path)
+        instances: List[Instance] = []
+        df_bhc = file_preprocessing(data_path, "brief_hospital_course")
+        df_di = file_preprocessing(data_path, "discharge_instructions")
+
+        for i in range(df_bhc.shape[0]):
+            prompt_bhc = create_prompt(
+                df_bhc.iloc[i]["text"], df_bhc.iloc[i]["text_df_radiology"], "Brief Hospital Course"
+            )
+            prompt_di = create_prompt(
+                df_di.iloc[i]["text"], df_di.iloc[i]["text_df_radiology"], "Discharge Instructions"
+            )
+            answer_bhc = df_bhc.iloc[i]["brief_hospital_course"]
+            answer_di = df_di.iloc[i]["discharge_instructions"]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt_bhc),
+                    references=[Reference(Output(text=answer_bhc), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+            instances.append(
+                Instance(
+                    input=Input(text=prompt_di),
+                    references=[Reference(Output(text=answer_di), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
+
+    def read_file(self, file_path: str) -> List[str]:
+        with open(file_path, "r") as file:
+            lines = file.readlines()
+            lines = [line.strip() for line in lines]
+            return lines
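To make the prompt format described in the new scenario's docstring concrete, here is a hedged usage sketch of create_prompt. The import path assumes the new file lands at helm/benchmark/scenarios/dischargeme_scenario.py, and the clinical text is an invented placeholder rather than MIMIC-IV data.

# Hedged sketch of the new prompt builder; the module path and sample strings
# are assumptions for illustration only.
from helm.benchmark.scenarios.dischargeme_scenario import create_prompt  # assumed module path

discharge_text = "Chief Complaint: chest pain. History of Present Illness: ..."
radiology_text = "CXR: no acute cardiopulmonary process."

prompt = create_prompt(discharge_text, radiology_text, "Brief Hospital Course")
print(prompt)
# Generate the Brief Hospital Course from the following patient discharge text and radiology report text.
#
# Discharge Text:
# Chief Complaint: chest pain. History of Present Illness: ...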
@@ -3,7 +3,16 @@ import os
 from typing import List, Dict, Optional
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
 WEDGING_DATA_URL = "https://drive.google.com/uc?export=download&id=1kWB3_F4Tobc_oVGC_T-a5DHEh-AB4GTc"
@@ -2,7 +2,16 @@ import numpy as np
 import random
 from typing import List, Tuple
 
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class DyckLanguageScenario(Scenario):