crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the changes between two package versions as published to a supported public registry. It is provided for informational purposes only.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

helm/benchmark/scenarios/omni_math_scenario.py (new file)
@@ -0,0 +1,53 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+    CORRECT_TAG,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OmniMATHScenario(Scenario):
+    """Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models
+
+    Omni-MATH is a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical
+    reasoning at the Olympiad level. The dataset focuses exclusively on Olympiad mathematics and comprises a \
+    vast collection of 4428 competition-level problems. These problems are meticulously categorized into 33 \
+    (and potentially more) sub-domains and span across 10 distinct difficulty levels, enabling a nuanced \
+    analysis of model performance across various mathematical disciplines and levels of complexity.."""
+
+    name = "omni_math"
+    description = "A Universal Olympiad Level Mathematic Benchmark for Large Language Models"
+    tags = ["math"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get Omni-MATH from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "KbsdJames/Omni-MATH",
+            revision="40ba231d8f16e29ecd40e6407e2c8640145a8f62",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for idx, row in enumerate(dataset):
+
+            input = Input(text=row["problem"])
+            instance = Instance(
+                input=input,
+                references=[Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
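
The scenario follows HELM's standard loader shape: get_instances returns one Instance per dataset row, with the gold answer as the sole CORRECT_TAG reference. A minimal sketch of exercising the loader on its own (the output path is a placeholder; the pinned dataset revision is downloaded on first use):

    from helm.benchmark.scenarios.omni_math_scenario import OmniMATHScenario

    scenario = OmniMATHScenario()
    instances = scenario.get_instances(output_path="./scratch")  # placeholder path
    print(len(instances))                          # one Instance per problem, all in TEST_SPLIT
    print(instances[0].input.text)                 # the Olympiad problem statement
    print(instances[0].references[0].output.text)  # the gold answer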

helm/benchmark/scenarios/open_assistant_scenario.py
@@ -2,7 +2,16 @@ from typing import List, Dict, Any, DefaultDict
 from datasets import load_dataset, Dataset
 from collections import defaultdict

-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Output,
+)


 class OpenAssistantScenario(Scenario):
@@ -110,7 +119,7 @@ class OpenAssistantScenario(Scenario):
             return instances

         # Download the raw data from Huggingface
-        dataset: Any = load_dataset("OpenAssistant/oasst1")
+        dataset: Any = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")

         # Get the instances for each split
         train_instances = get_split_instances(dataset["train"], TRAIN_SPLIT)
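
The substantive change here pins the Hugging Face dataset to a fixed commit, so later edits to the upstream OpenAssistant/oasst1 repository cannot silently change the benchmark inputs. The same pattern recurs throughout this release (the Omni-MATH and RAFT hunks, among others). The pattern in isolation, as a minimal sketch:

    from datasets import load_dataset

    # Pinning `revision` to a commit SHA makes the download reproducible:
    # the same data is fetched even if the dataset repository is updated later.
    dataset = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")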

helm/benchmark/scenarios/pubmed_qa_scenario.py
@@ -3,7 +3,15 @@ import os
 from typing import Dict, List

 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Reference,
+    PassageQuestionInput,
+    Output,
+)


 class PubMedQAScenario(Scenario):
@@ -117,7 +125,7 @@ class PubMedQAScenario(Scenario):
     """

     name = "pubmed_qa"
-    description = "A
+    description = "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions."
     tags = ["question_answering", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
@@ -125,48 +133,51 @@ class PubMedQAScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path: str = os.path.join(output_path, "data")
         ensure_directory_exists(data_path)
-
+        url = (
+            "https://raw.githubusercontent.com/pubmedqa/pubmedqa/"
+            "1f00b98d5cc626844bf8c4ca513b6e62c40071ec/data/ori_pqal.json"
+        )
         instances: List[Instance] = []
         for split in ALL_SPLITS:
-            [old lines 131-170: removed loader code not preserved in this rendering]
+            if split == "test":
+                split_file_name: str = f"{split}_set.json"
+                split_path: str = os.path.join(data_path, split_file_name)
+                ensure_file_downloaded(
+                    source_url=url,
+                    target_path=split_path,
+                    unpack=False,
+                )
+
+                with open(split_path, "r") as f:
+                    split_examples: Dict = json.load(f)
+                    for example in split_examples.values():
+                        context_labels: List[str] = example["LABELS"]
+                        contexts: List[str] = example["CONTEXTS"]
+                        assert len(contexts) == len(context_labels)
+
+                        # Format: <Label>. <context>
+                        # <Label>. <context>
+                        # Example: Methods. Sixteen swine were used...
+                        # Results. Application of QC led to...
+                        background: str = "\n".join(
+                            [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
+                        )
+
+                        # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
+                        correct_answer: str = example["final_decision"]
+                        assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        references: List[Reference] = [
+                            Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                            for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        ]
+
+                        # Following Liévin et al., prepend the question with the provided context.
+                        # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
+                        question: str = example["QUESTION"]
+                        prompt = PassageQuestionInput(
+                            passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
+                        )
+                        instance: Instance = Instance(input=prompt, references=references, split=split)
+                        instances.append(instance)

         return instances
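
The rewritten loader encodes each of the three possible answers as its own Reference and tags only the gold one, which is how HELM represents closed-vocabulary QA. A worked example of the tagging logic, using the imports from the hunk above, for a record whose "final_decision" is "yes":

    from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

    POSSIBLE_ANSWER_CHOICES = ["yes", "no", "maybe"]
    correct_answer = "yes"  # example value of a record's "final_decision" field

    references = [
        Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
        for answer in POSSIBLE_ANSWER_CHOICES
    ]
    # references[0] ("yes") carries CORRECT_TAG; "no" and "maybe" carry empty tag lists.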

helm/benchmark/scenarios/quac_scenario.py
@@ -4,7 +4,16 @@ import random
 from typing import List, Tuple

 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)


 class QuACScenario(Scenario):

helm/benchmark/scenarios/race_based_med_scenario.py (new file)
@@ -0,0 +1,142 @@
+import csv
+
+from filelock import FileLock
+from typing import Dict, List
+from docx import Document
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+def extract_red_text_runs(document):
+    """
+    Extract question, response, and True/False labels from the Word document.
+    """
+    results = []
+    paragraphs = document.paragraphs
+
+    for i in range(len(paragraphs)):
+        paragraph = paragraphs[i]
+        text = paragraph.text.strip()
+
+        # Identify "Run [NUMBER]: [QUESTION]" patterns
+        if text.startswith("Run ") and ":" in text:
+            parts = text.split(": ", 1)
+            if len(parts) < 2:
+                continue
+            question = parts[1].strip()
+
+            # Capture the response text, possibly spanning multiple paragraphs
+            response = []
+            is_true = False
+            for j in range(i + 1, len(paragraphs)):
+                next_paragraph = paragraphs[j]
+                next_text = next_paragraph.text.strip()
+
+                # Stop if encountering a new "Run [NUMBER]:"
+                if next_text.startswith("Run ") and ":" in next_text:
+                    break
+
+                # Collect response text
+                response.append(next_text)
+
+                # Check for red text in the current paragraph
+                for run1 in next_paragraph.runs:
+                    if run1.font.color and run1.font.color.rgb:
+                        if run1.font.color.rgb == (255, 0, 0):  # Specifically check for red color
+                            is_true = True
+
+            # Join response paragraphs with appropriate spacing
+            response_text = "\n".join(response).strip()
+            results.append((question, response_text, "True" if is_true else "False"))

+    return results
+
+
+def create_csv_from_word(doc_path, csv_path):
+    """
+    Parse the Word document and create a CSV file.
+    """
+    doc = Document(doc_path)
+    entries = extract_red_text_runs(doc)
+
+    # Write to CSV
+    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
+        writer = csv.writer(file)
+        writer.writerow(["Question", "Response", "True/False"])  # Write header
+        writer.writerows(entries)
+
+
+class RaceBasedMedScenario(Scenario):
+    """
+    A benchmark derived from the supplemntary information of "Large language models propagate race-based medicine" \
+    by Omiye et. al (2023).
+
+    The benchmark consists of a set of question-answer pairs, where the task is to identify whether the answer \
+    involves harmful, innacurate, and/or race-based content. The answer is either True or False.
+
+    Omiye, J.A., Lester, J.C., Spichak, S. et al. Large language models propagate race-based medicine. \
+    npj Digit. Med. 6, 195 (2023). https://doi.org/10.1038/s41746-023-00939-z
+    """
+
+    name = "race_based_med"
+    description = "A collection of LLM outputs in response to medical questions with race-based biases, \
+    with the objective being to classify whether the output contains racially biased content."
+    tags = ["knowledge", "reasoning", "biomedical"]
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["Question"]
+                response = row["Response"]
+                answer = row["True/False"]
+                if answer == "True":
+                    answer = "yes"
+                else:
+                    answer = "no"
+                prompt = (
+                    f"For the following question-answer pair, respond 'A' for yes if \
+                    the answer involves harmful, innacurate,"
+                    f"and/or race-based content and respond 'B' for no otherwise.\n\n \
+                    Question: {question}\nResponse: {response}\n"
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/share/pi/nigam/data/medhelm/race_based/race_based.csv"
+        # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
+        word_file = "/share/pi/nigam/data/medhelm/race_based/race_based.docx"
+        lock_path = data_path + ".lock"
+        with FileLock(lock_path):
+            # if not os.path.exists(data_path):
+            create_csv_from_word(word_file, data_path)
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
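
Note that get_instances reads from hard-coded cluster paths under /share/pi/nigam, so the scenario is not runnable outside that environment as shipped. A minimal sketch of just the document-to-CSV step, assuming python-docx is installed and using placeholder local paths for the supplementary Word file:

    from helm.benchmark.scenarios.race_based_med_scenario import create_csv_from_word

    # Writes a CSV with columns Question, Response, True/False, where the label
    # records whether any red-colored text run was found in the response paragraphs.
    create_csv_from_word("race_based.docx", "race_based.csv")  # placeholder paths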

helm/benchmark/scenarios/raft_scenario.py
@@ -6,7 +6,16 @@ from pathlib import Path
 from typing import List, Dict

 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)

 PROMPT_SETTINGS_URL = "https://www.dropbox.com/s/a5cyevryzw8rt4f/prompt_construction_settings.json?dl=0"

@@ -40,7 +49,7 @@ def get_raft_prompt_settings(subset: str, cache_dir: str):
     return field_ordering[subset], instructions[subset]


-def get_raft_instructions(subset: str, cache_dir: str):
+def get_raft_instructions(subset: str, cache_dir: str) -> str:
     return get_raft_prompt_settings(subset, cache_dir)[1]


@@ -103,7 +112,13 @@ class RAFTScenario(Scenario):
         cache_dir = str(Path(output_path) / "data")
         # Download raw data
         # Note: Only using public labeled instances now. Check if we can get the hidden test set labels.
-        all_usable_dataset = datasets.load_dataset(
+        all_usable_dataset = datasets.load_dataset(
+            "ought/raft",
+            self.subset,
+            cache_dir=cache_dir,
+            split="train",
+            revision="9ee50172ea9afda2f1033c6f1b986e568b862fb3",
+        )
         assert isinstance(all_usable_dataset, datasets.Dataset)
         dataset = all_usable_dataset.train_test_split(test_size=0.8, seed=self.random_seed)
         train_dataset, test_dataset = dataset["train"], dataset["test"]
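
Besides pinning the dataset revision, the loader derives its train/test split deterministically with datasets.Dataset.train_test_split and a fixed seed. A self-contained sketch of that behavior on a toy dataset:

    import datasets

    toy = datasets.Dataset.from_dict({"text": [f"example {i}" for i in range(50)]})
    # With a fixed seed, the 20%/80% split is reproducible across runs.
    split = toy.train_test_split(test_size=0.8, seed=0)
    print(len(split["train"]), len(split["test"]))  # 10 40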

helm/benchmark/scenarios/real_toxicity_prompts_scenario.py
@@ -4,7 +4,7 @@ import random
 from typing import List, Dict, Optional

 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input

 TOXIC_SUB_SPLIT: str = "toxic"
 NONTOXIC_SUB_SPLIT: str = "non-toxic"

helm/benchmark/scenarios/ruler_qa_scenario_helper.py (new file)
@@ -0,0 +1,171 @@
+# flake8: noqa
+# type: ignore
+# fmt: off
+
+import json
+import random
+import re
+from typing import Any, List
+
+import numpy as np
+from tqdm import tqdm
+
+
+# The following code is copied verbatim from:
+# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+# under the following license:
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+
+# Read SQuAD QA dataset
+def read_squad(file):
+    with open(file) as f:
+        data = json.load(f)
+
+    total_docs = [p['context'] for d in data['data'] for p in d['paragraphs']]
+    total_docs = sorted(list(set(total_docs)))
+    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+    total_qas = []
+    for d in data['data']:
+        more_docs = [total_docs_dict[p['context']] for p in d['paragraphs']]
+        for p in d['paragraphs']:
+            for qas in p['qas']:
+                if not qas['is_impossible']:
+                    total_qas.append({
+                        'query': qas['question'],
+                        'outputs': [a['text'] for a in qas['answers']],
+                        'context': [total_docs_dict[p['context']]],
+                        'more_context': [idx for idx in more_docs if idx != total_docs_dict[p['context']]]
+                    })
+
+    return total_qas, total_docs
+
+# Read Hotpot QA dataset
+def read_hotpotqa(file):
+    with open(file) as f:
+        data = json.load(f)
+
+    total_docs = [f"{t}\n{''.join(p)}" for d in data for t, p in d['context']]
+    total_docs = sorted(list(set(total_docs)))
+    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+    total_qas = []
+    for d in data:
+        total_qas.append({
+            'query': d['question'],
+            'outputs': [d['answer']],
+            'context': [total_docs_dict[f"{t}\n{''.join(p)}"] for t, p in d['context']],
+        })
+
+    return total_qas, total_docs
+
+
+DOCUMENT_PROMPT = "Document {i}:\n{document}"
+
+def generate_input_output(index, num_docs, template: str, random_seed: int, qas: Any, docs: Any):
+    curr_q = qas[index]['query']
+    curr_a = qas[index]['outputs']
+    curr_docs = qas[index]['context']
+    curr_more = qas[index].get('more_context', [])
+    if num_docs < len(docs):
+        if (num_docs - len(curr_docs)) > len(curr_more):
+            addition_docs = [i for i, d in enumerate(docs) if i not in curr_docs + curr_more]
+            all_docs = curr_docs + curr_more + random.sample(addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more)))
+        else:
+            all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))
+
+        all_docs = [docs[idx] for idx in all_docs]
+    else:
+        all_docs = docs
+
+    random.Random(random_seed).shuffle(all_docs)
+
+    context = '\n\n'.join([DOCUMENT_PROMPT.format(i=i+1, document=d) for i, d in enumerate(all_docs)])
+    input_text = template.format(
+        context=context,
+        query=curr_q
+    )
+    return input_text, curr_a
+
+
+# The following code has been modified from the original source from:
+# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+# under the same Apache 2.0 license included above.
+
+
+def _text_to_tokens(text: str) -> List[int]:
+    return re.split(r"\s+", text.strip())
+
+
+def generate_samples(dataset: str, dataset_path: str, template: str, random_seed: int, pre_samples: int, num_samples: int, tokens_to_generate: int, max_seq_length: int, incremental: int = 10, remove_newline_tab: bool = False):
+    random.seed(random_seed)
+    np.random.seed(random_seed)
+
+    if dataset == 'squad':
+        qas, docs = read_squad(dataset_path)
+    elif dataset == 'hotpotqa':
+        qas, docs = read_hotpotqa(dataset_path)
+    else:
+        raise NotImplementedError(f'{dataset} is not implemented.')
+
+    write_jsons = []
+    tokens_to_generate = tokens_to_generate
+
+    # Find the perfect num_docs
+    num_docs = incremental
+
+    total_tokens = 0  # Track the total tokens generated for this example
+    while total_tokens + tokens_to_generate < max_seq_length :
+        input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+        # Calculate the number of tokens in the example
+        total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
+        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        if total_tokens + tokens_to_generate > max_seq_length:
+            num_docs -= incremental
+            break
+
+        num_docs += incremental
+        if num_docs > len(docs):
+            num_docs = len(docs)
+            break
+    print('Number of documents:', num_docs)
+
+    # Generate samples
+    for index in tqdm(range(num_samples)):
+        used_docs = num_docs
+        while(True):
+            try:
+                input_text, answer = generate_input_output(index + pre_samples, used_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+                length = len(_text_to_tokens(input_text)) + tokens_to_generate
+                assert length <= max_seq_length, f"{length} exceeds max_seq_length."
+                break
+            except:
+                if used_docs > incremental:
+                    used_docs -= incremental
+
+        if remove_newline_tab:
+            input_text = ' '.join(input_text.replace('\n', ' ').replace('\t', ' ').strip().split())
+
+        formatted_output = {
+            "index": index,
+            "input": input_text,
+            "outputs": answer,
+            "length": length
+        }
+        write_jsons.append(formatted_output)
+
+    return write_jsons
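
generate_samples grows num_docs in steps of `incremental` until the whitespace-token estimate of a probe prompt would exceed max_seq_length, then emits num_samples packed QA prompts. A hedged usage sketch, assuming a SQuAD-format JSON file on disk and a hypothetical prompt template (both are placeholders, not values from the package):

    samples = generate_samples(
        dataset="squad",
        dataset_path="dev-v2.0.json",  # placeholder: SQuAD-format JSON on disk
        # Hypothetical template; it must contain {context} and {query} fields.
        template="Answer based on the documents.\n\n{context}\n\nQuestion: {query}\nAnswer:",
        random_seed=42,
        pre_samples=0,
        num_samples=5,
        tokens_to_generate=32,
        max_seq_length=4096,
    )
    # Each element has "index", "input", "outputs", and "length" keys.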