crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of crfm-helm has been flagged as a potentially problematic release.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/pubmed_qa_scenario.py

@@ -3,7 +3,15 @@ import os
 from typing import Dict, List

 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Reference,
+    PassageQuestionInput,
+    Output,
+)


 class PubMedQAScenario(Scenario):

@@ -117,7 +125,12 @@ class PubMedQAScenario(Scenario):
     """

     name = "pubmed_qa"
-    description =
+    description = (
+        "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
+        "ability to interpret scientific literature. It consists of PubMed abstracts paired with"
+        "yes/no/maybe questions derived from the content. The benchmark assesses a model's"
+        "capability to reason over biomedical texts and provide factually grounded answers."
+    )
     tags = ["question_answering", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]

@@ -125,48 +138,51 @@ class PubMedQAScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path: str = os.path.join(output_path, "data")
         ensure_directory_exists(data_path)
-[1 removed line; not shown in this rendering]
+        url = (
+            "https://raw.githubusercontent.com/pubmedqa/pubmedqa/"
+            "1f00b98d5cc626844bf8c4ca513b6e62c40071ec/data/ori_pqal.json"
+        )
         instances: List[Instance] = []
         for split in ALL_SPLITS:
-[40 removed lines; not shown in this rendering]
+            if split == "test":
+                split_file_name: str = f"{split}_set.json"
+                split_path: str = os.path.join(data_path, split_file_name)
+                ensure_file_downloaded(
+                    source_url=url,
+                    target_path=split_path,
+                    unpack=False,
+                )
+
+                with open(split_path, "r") as f:
+                    split_examples: Dict = json.load(f)
+                    for example in split_examples.values():
+                        context_labels: List[str] = example["LABELS"]
+                        contexts: List[str] = example["CONTEXTS"]
+                        assert len(contexts) == len(context_labels)
+
+                        # Format: <Label>. <context>
+                        #         <Label>. <context>
+                        # Example: Methods. Sixteen swine were used...
+                        #          Results. Application of QC led to...
+                        background: str = "\n".join(
+                            [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
+                        )
+
+                        # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
+                        correct_answer: str = example["final_decision"]
+                        assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        references: List[Reference] = [
+                            Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                            for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        ]
+
+                        # Following Liévin et al., prepend the question with the provided context.
+                        # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
+                        question: str = example["QUESTION"]
+                        prompt = PassageQuestionInput(
+                            passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
+                        )
+                        instance: Instance = Instance(input=prompt, references=references, split=split)
+                        instances.append(instance)

         return instances
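The rebuilt loop formats each PubMed abstract as labeled sections and wraps it with `PassageQuestionInput`. A minimal, self-contained sketch of the resulting prompt text (not HELM code — it assumes `PassageQuestionInput` concatenates `passage_prefix + passage + separator + question_prefix + question` with a default question prefix of `"Question: "`; `format_pubmed_prompt` is our illustrative helper):

```python
# Sketch of the PubMedQA prompt layout, under the stated assumptions about
# PassageQuestionInput's concatenation behavior.
def format_pubmed_prompt(labels, contexts, question):
    # One "<Label>. <context>" line per abstract section,
    # e.g. "Methods. Sixteen swine were used..."
    background = "\n".join(f"{label.title()}. {context}" for label, context in zip(labels, contexts))
    return f"Context: {background}\n\nQuestion: {question}\n"


print(format_pubmed_prompt(
    ["METHODS", "RESULTS"],
    ["Sixteen swine were used.", "Application of QC led to hemostasis."],
    "Does QC achieve hemostasis?",
))
```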
helm/benchmark/scenarios/quac_scenario.py

@@ -4,7 +4,16 @@ import random
 from typing import List, Tuple

 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)


 class QuACScenario(Scenario):
helm/benchmark/scenarios/race_based_med_scenario.py (new file)

@@ -0,0 +1,152 @@
+import csv
+import os
+
+from typing import Dict, List
+from docx import Document
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded
+
+
+def extract_red_text_runs(document):
+    """
+    Extract question, response, and True/False labels from the Word document.
+    """
+    results = []
+    paragraphs = document.paragraphs
+
+    for i in range(len(paragraphs)):
+        paragraph = paragraphs[i]
+        text = paragraph.text.strip()
+
+        # Identify "Run [NUMBER]: [QUESTION]" patterns
+        if text.startswith("Run ") and ":" in text:
+            parts = text.split(": ", 1)
+            if len(parts) < 2:
+                continue
+            question = parts[1].strip()
+
+            # Capture the response text, possibly spanning multiple paragraphs
+            response = []
+            is_true = False
+            for j in range(i + 1, len(paragraphs)):
+                next_paragraph = paragraphs[j]
+                next_text = next_paragraph.text.strip()
+
+                # Stop if encountering a new "Run [NUMBER]:"
+                if next_text.startswith("Run ") and ":" in next_text:
+                    break
+
+                # Collect response text
+                response.append(next_text)
+
+                # Check for red text in the current paragraph
+                for run1 in next_paragraph.runs:
+                    if run1.font.color and run1.font.color.rgb:
+                        if run1.font.color.rgb == (255, 0, 0):  # Specifically check for red color
+                            is_true = True
+
+            # Join response paragraphs with appropriate spacing
+            response_text = "\n".join(response).strip()
+            results.append((question, response_text, "True" if is_true else "False"))
+
+    return results
+
+
+def create_csv_from_word(doc_path, csv_path):
+    """
+    Parse the Word document and create a CSV file.
+    """
+    doc = Document(doc_path)
+    entries = extract_red_text_runs(doc)
+
+    # Write to CSV
+    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
+        writer = csv.writer(file)
+        writer.writerow(["Question", "Response", "True/False"])  # Write header
+        writer.writerows(entries)
+
+
+class RaceBasedMedScenario(Scenario):
+    """
+    A benchmark derived from the supplemntary information of "Large language models propagate race-based medicine" \
+    by Omiye et. al (2023).
+
+    The benchmark consists of a set of question-answer pairs, where the task is to identify whether the answer \
+    involves harmful, innacurate, and/or race-based content. The answer is either True or False.
+
+    Omiye, J.A., Lester, J.C., Spichak, S. et al. Large language models propagate race-based medicine. \
+    npj Digit. Med. 6, 195 (2023). https://doi.org/10.1038/s41746-023-00939-z
+    """
+
+    name = "race_based_med"
+    description = (
+        "RaceBias is a benchmark used to evaluate language models for racially biased or"
+        "inappropriate content in medical question-answering scenarios. Each instance consists"
+        "of a medical question and a model-generated response. The task is to classify whether"
+        "the response contains race-based, harmful, or inaccurate content. This benchmark"
+        "supports research into bias detection and fairness in clinical AI systems."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+    FILE_URL: str = (
+        "https://static-content.springer.com/esm/"
+        "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
+    )
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["Question"]
+                response = row["Response"]
+                answer = row["True/False"]
+                if answer == "True":
+                    answer = "yes"
+                else:
+                    answer = "no"
+                prompt = (
+                    f"For the following question-answer pair, respond 'A' for yes if \
+                    the answer involves harmful, innacurate,"
+                    f"and/or race-based content and respond 'B' for no otherwise.\n\n \
+                    Question: {question}\nResponse: {response}\n"
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
+        data_path = os.path.join(output_path, "race_based.csv")
+
+        if not os.path.exists(data_path):
+            word_file = os.path.join(output_path, "race_based.docx")
+            ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
+            create_csv_from_word(word_file, data_path)
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )

        return instances
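The red-run check in `extract_red_text_runs` can be exercised on its own. A minimal sketch using python-docx (the `.docx` path is hypothetical; it assumes, as the scenario code appears to, that `run.font.color.rgb` is an `RGBColor`, a tuple subclass that compares equal to a plain `(255, 0, 0)` for pure-red runs):

```python
# Standalone sketch of the red-text detection idea, under the stated
# assumption about python-docx's RGBColor comparing equal to an RGB tuple.
from docx import Document
from docx.shared import RGBColor

RED = RGBColor(0xFF, 0x00, 0x00)


def has_red_run(paragraph) -> bool:
    # A paragraph counts as "red" if any of its runs has an explicit pure-red font color.
    return any(
        run.font.color is not None and run.font.color.rgb == RED
        for run in paragraph.runs
    )


doc = Document("race_based.docx")  # hypothetical local path
red_count = sum(has_red_run(p) for p in doc.paragraphs)
print(f"{red_count} paragraphs contain red text")
```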
helm/benchmark/scenarios/raft_scenario.py

@@ -6,7 +6,16 @@ from pathlib import Path
 from typing import List, Dict

 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)

 PROMPT_SETTINGS_URL = "https://www.dropbox.com/s/a5cyevryzw8rt4f/prompt_construction_settings.json?dl=0"

@@ -103,7 +112,13 @@ class RAFTScenario(Scenario):
         cache_dir = str(Path(output_path) / "data")
         # Download raw data
         # Note: Only using public labeled instances now. Check if we can get the hidden test set labels.
-        all_usable_dataset = datasets.load_dataset(
+        all_usable_dataset = datasets.load_dataset(
+            "ought/raft",
+            self.subset,
+            cache_dir=cache_dir,
+            split="train",
+            revision="9ee50172ea9afda2f1033c6f1b986e568b862fb3",
+        )
         assert isinstance(all_usable_dataset, datasets.Dataset)
         dataset = all_usable_dataset.train_test_split(test_size=0.8, seed=self.random_seed)
         train_dataset, test_dataset = dataset["train"], dataset["test"]
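The substantive change in the second hunk is pinning `ought/raft` to a fixed git revision, which keeps results reproducible even if the upstream dataset repository changes. A minimal sketch of the same call outside HELM (the subset name is illustrative; the revision hash is the one pinned in the diff):

```python
# Sketch: loading a Hugging Face dataset pinned to a fixed repo revision.
# "ade_corpus_v2" is one RAFT subset, chosen here only for illustration.
import datasets

dataset = datasets.load_dataset(
    "ought/raft",
    "ade_corpus_v2",
    split="train",
    revision="9ee50172ea9afda2f1033c6f1b986e568b862fb3",
)
print(len(dataset), dataset.column_names)
```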
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py

@@ -4,7 +4,7 @@ import random
 from typing import List, Dict, Optional

 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input

 TOXIC_SUB_SPLIT: str = "toxic"
 NONTOXIC_SUB_SPLIT: str = "non-toxic"
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
# type: ignore
|
|
3
|
+
# fmt: off
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import random
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any, List
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# The following code is copied verbatim from:
|
|
15
|
+
# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
|
|
16
|
+
# under the following license:
|
|
17
|
+
#
|
|
18
|
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
|
19
|
+
#
|
|
20
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
21
|
+
# you may not use this file except in compliance with the License.
|
|
22
|
+
# You may obtain a copy of the License at
|
|
23
|
+
#
|
|
24
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
25
|
+
#
|
|
26
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
27
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
28
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
29
|
+
# See the License for the specific language governing permissions and
|
|
30
|
+
# limitations under the License
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
+# Read SQuAD QA dataset
+def read_squad(file):
+    with open(file) as f:
+        data = json.load(f)
+
+    total_docs = [p['context'] for d in data['data'] for p in d['paragraphs']]
+    total_docs = sorted(list(set(total_docs)))
+    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+    total_qas = []
+    for d in data['data']:
+        more_docs = [total_docs_dict[p['context']] for p in d['paragraphs']]
+        for p in d['paragraphs']:
+            for qas in p['qas']:
+                if not qas['is_impossible']:
+                    total_qas.append({
+                        'query': qas['question'],
+                        'outputs': [a['text'] for a in qas['answers']],
+                        'context': [total_docs_dict[p['context']]],
+                        'more_context': [idx for idx in more_docs if idx != total_docs_dict[p['context']]]
+                    })
+
+    return total_qas, total_docs
+
+
+# Read HotpotQA dataset
+def read_hotpotqa(file):
+    with open(file) as f:
+        data = json.load(f)
+
+    total_docs = [f"{t}\n{''.join(p)}" for d in data for t, p in d['context']]
+    total_docs = sorted(list(set(total_docs)))
+    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+    total_qas = []
+    for d in data:
+        total_qas.append({
+            'query': d['question'],
+            'outputs': [d['answer']],
+            'context': [total_docs_dict[f"{t}\n{''.join(p)}"] for t, p in d['context']],
+        })
+
+    return total_qas, total_docs
+
+
+DOCUMENT_PROMPT = "Document {i}:\n{document}"
+
+
+def generate_input_output(index, num_docs, template: str, random_seed: int, qas: Any, docs: Any):
+    curr_q = qas[index]['query']
+    curr_a = qas[index]['outputs']
+    curr_docs = qas[index]['context']
+    curr_more = qas[index].get('more_context', [])
+    if num_docs < len(docs):
+        if (num_docs - len(curr_docs)) > len(curr_more):
+            addition_docs = [i for i, d in enumerate(docs) if i not in curr_docs + curr_more]
+            all_docs = curr_docs + curr_more + random.sample(addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more)))
+        else:
+            all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))
+
+        all_docs = [docs[idx] for idx in all_docs]
+    else:
+        all_docs = docs
+
+    random.Random(random_seed).shuffle(all_docs)
+
+    context = '\n\n'.join([DOCUMENT_PROMPT.format(i=i+1, document=d) for i, d in enumerate(all_docs)])
+    input_text = template.format(
+        context=context,
+        query=curr_q
+    )
+    return input_text, curr_a
+
+
+# The following code has been modified from the original source from:
+# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+# under the same Apache 2.0 license included above.
+
+
+def _text_to_tokens(text: str) -> List[str]:
+    return re.split(r"\s+", text.strip())
+
+
+def generate_samples(dataset: str, dataset_path: str, template: str, random_seed: int, pre_samples: int, num_samples: int, tokens_to_generate: int, max_seq_length: int, incremental: int = 10, remove_newline_tab: bool = False):
+    random.seed(random_seed)
+    np.random.seed(random_seed)
+
+    if dataset == 'squad':
+        qas, docs = read_squad(dataset_path)
+    elif dataset == 'hotpotqa':
+        qas, docs = read_hotpotqa(dataset_path)
+    else:
+        raise NotImplementedError(f'{dataset} is not implemented.')
+
+    write_jsons = []
+
+    # Find the largest num_docs that still fits within max_seq_length
+    num_docs = incremental
+
+    total_tokens = 0  # Track the total tokens generated for this example
+    while total_tokens + tokens_to_generate < max_seq_length:
+        input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+        # Calculate the number of tokens in the example
+        total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
+        # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        if total_tokens + tokens_to_generate > max_seq_length:
+            num_docs -= incremental
+            break
+
+        num_docs += incremental
+        if num_docs > len(docs):
+            num_docs = len(docs)
+            break
+    # print('Number of documents:', num_docs)
+
+    # Generate samples
+    for index in tqdm(range(num_samples)):
+        used_docs = num_docs
+        while True:
+            try:
+                input_text, answer = generate_input_output(index + pre_samples, used_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+                length = len(_text_to_tokens(input_text)) + tokens_to_generate
+                assert length <= max_seq_length, f"{length} exceeds max_seq_length."
+            break
+            except Exception:
+                # Prompt too long: retry with fewer documents.
+                if used_docs > incremental:
+                    used_docs -= incremental
+
+        if remove_newline_tab:
+            input_text = ' '.join(input_text.replace('\n', ' ').replace('\t', ' ').strip().split())
+
+        formatted_output = {
+            "index": index,
+            "input": input_text,
+            "outputs": answer,
+            "length": length
+        }
+        write_jsons.append(formatted_output)
+
+    return write_jsons
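For orientation (not part of the diff), here is a minimal sketch of how the helper above might be driven once a SQuAD dev file is on disk; the file path and template string are hypothetical placeholders:

    # Hypothetical driver for generate_samples(); assumes a local SQuAD v2 dev file.
    template = "Documents:\n\n{context}\n\nQuestion: {query} Answer:"  # placeholder template
    samples = generate_samples(
        dataset="squad",
        dataset_path="dev-v2.0.json",  # hypothetical local path
        template=template,
        random_seed=42,
        pre_samples=0,
        num_samples=10,
        tokens_to_generate=32,
        max_seq_length=4096,
    )
    print(samples[0]["length"], samples[0]["outputs"])

Each returned record carries the fully formatted prompt under "input", the gold answers under "outputs", and a "length" that counts whitespace-delimited tokens plus the generation budget.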
@@ -0,0 +1,88 @@
+import os
+from typing import List
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples  # type: ignore
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+_DATASET_TO_URL = {
+    "hotpotqa": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json",
+    "squad": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
+}
+
+
+class _RULERQAScenario(Scenario):
+    name = "ruler_qa"
+    description = "A QA scenario from RULER"
+    tags = ["long_context", "rag"]
+
+    _TEMPLATE = """Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+The following are given documents.
+
+{context}
+
+Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+Question: {query} Answer:"""  # noqa: E501
+
+    def __init__(self, dataset: str, max_num_words: int):
+        super().__init__()
+        self.dataset = dataset or "hotpotqa"
+        self.max_num_words = max_num_words
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+        file_path = os.path.join(data_dir, f"{self.dataset}.json")
+        url = _DATASET_TO_URL[self.dataset]
+        ensure_file_downloaded(url, file_path)
+        instances: List[Instance] = []
+        samples = generate_samples(
+            dataset=self.dataset,
+            dataset_path=file_path,
+            max_seq_length=self.max_num_words,
+            tokens_to_generate=32,
+            num_samples=500,
+            random_seed=42,
+            pre_samples=0,
+            template=self._TEMPLATE,
+        )
+        for sample in samples:
+            instance = Instance(
+                id=sample["index"],
+                input=Input(text=sample["input"]),
+                references=[
+                    Reference(Output(text=output_text), tags=[CORRECT_TAG]) for output_text in sample["outputs"]
+                ],
+                split=VALID_SPLIT,
+            )
+            instances.append(instance)
+        return instances
+
+
+class RULERHotpotQAScenario(_RULERQAScenario):
+    name = "ruler_hotpotqa"
+    description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate multi-hop question answering as a long-context scenario."  # noqa: E501
+    tags = ["long_context", "rag"]
+
+    def __init__(self, max_num_words: int):
+        super().__init__("hotpotqa", max_num_words)
+
+
+class RULERSQuADScenario(_RULERQAScenario):
+    name = "ruler_squad"
+    description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate single-hop question answering as a long-context scenario."  # noqa: E501
+    tags = ["long_context", "rag"]
+
+    def __init__(self, max_num_words: int):
+        super().__init__("squad", max_num_words)
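As a rough usage sketch (again, not part of the diff): the concrete scenario classes take only a word budget, and `get_instances` downloads the dataset before building instances; the output path below is hypothetical:

    # Hypothetical invocation; downloads dev-v2.0.json into <output_path>/data/ on first use.
    scenario = RULERSQuADScenario(max_num_words=4096)
    instances = scenario.get_instances(output_path="/tmp/ruler_squad")  # hypothetical path
    print(len(instances), instances[0].split)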
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, replace
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 import os
 from pathlib import PurePath
 import inspect
@@ -67,6 +67,11 @@ class Input:
     multimedia_content: Optional[MultimediaObject] = None
     """A single input can consist of multimodal content interleaved (e.g., text, image, text, ...)."""
 
+    messages: Optional[List[Dict[str, str]]] = None
+    """Used for chat models.
+    If messages is specified for a chat model, the prompt is ignored.
+    Otherwise, the client should convert the prompt into a message."""
+
 
 @dataclass(frozen=True)
 class PassageQuestionInput(Input):
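A minimal sketch of the new `messages` field, assuming the role/content message shape implied by the `List[Dict[str, str]]` type (this exact payload is illustrative, not mandated by the diff):

    # Hypothetical chat-style Input; clients should prefer `messages` over the
    # plain-text prompt whenever it is set.
    chat_input = Input(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Name the capital of France."},
        ]
    )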
@@ -153,6 +158,9 @@ class Instance:
     contrast_references: Optional[List[List[Reference]]] = None
     """References for the perturbed input above (if available)"""
 
+    extra_data: Optional[Dict[str, Any]] = None
+    """Extra data required by the scenario, e.g. chain-of-thought annotations"""
+
     @property
     def first_correct_reference(self) -> Optional[Reference]:
         """Return the first correct reference."""
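A sketch of how a scenario might populate `extra_data`; the field is an arbitrary dictionary, so the `chain_of_thought` key below is illustrative rather than required by the diff:

    # Hypothetical instance carrying a chain-of-thought annotation.
    instance = Instance(
        input=Input(text="What is 2 + 2?"),
        references=[Reference(Output(text="4"), tags=[CORRECT_TAG])],
        split=VALID_SPLIT,
        extra_data={"chain_of_thought": "2 + 2 = 4"},  # illustrative key
    )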
@@ -19,7 +19,7 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 
-#
+# SEA-HELM Scenarios
 # A. Natural Language Understanding
 # B. Natural Language Generation
 # C. Natural Language Reasoning
@@ -95,7 +95,12 @@ class TyDiQAScenario(Scenario):
         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
 
     def get_instances(self, output_path) -> List[Instance]:
-        dataset = datasets.load_dataset(
+        dataset = datasets.load_dataset(
+            "khalidalt/tydiqa-goldp",
+            "indonesian",
+            revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+            trust_remote_code=True,
+        )
 
         outputs = []
         for split in self.splits.keys():
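The change above pins the Hugging Face dataset to an exact commit, so reruns see the same data even if `khalidalt/tydiqa-goldp` is later updated, and `trust_remote_code=True` is needed because that dataset repository ships its own loading script. The same pinning pattern works for any `datasets.load_dataset` call:

    import datasets

    # Pinning `revision` to a commit hash freezes the dataset snapshot.
    dataset = datasets.load_dataset(
        "khalidalt/tydiqa-goldp",
        "indonesian",
        revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
        trust_remote_code=True,
    )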
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
 
 
 class SelfInstructScenario(Scenario):