crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py (new file, +88 -0)

@@ -0,0 +1,88 @@
+import os
+from typing import List
+
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples  # type: ignore
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+_DATASET_TO_URL = {
+    "hotpotqa": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json",
+    "squad": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
+}
+
+
+class _RULERQAScenario(Scenario):
+    name = "ruler_qa"
+    description = "A QA scenario from Ruler"
+    tags = ["long_context", "rag"]
+
+    _TEMPLATE = """Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+The following are given documents.
+
+{context}
+
+Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+Question: {query} Answer:"""  # noqa: E501
+
+    def __init__(self, dataset: str, max_num_words: int):
+        super().__init__()
+        self.dataset = dataset or "hotpotqa"
+        self.max_num_words = max_num_words
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+        file_path = os.path.join(data_dir, f"{self.dataset}.json")
+        url = _DATASET_TO_URL[self.dataset]
+        ensure_file_downloaded(url, file_path)
+        instances: List[Instance] = []
+        samples = generate_samples(
+            dataset=self.dataset,
+            dataset_path=file_path,
+            max_seq_length=self.max_num_words,
+            tokens_to_generate=32,
+            num_samples=500,
+            random_seed=42,
+            pre_samples=0,
+            template=self._TEMPLATE,
+        )
+        for sample in samples:
+            instance = Instance(
+                id=sample["index"],
+                input=Input(text=sample["input"]),
+                references=[
+                    Reference(Output(text=output_text), tags=[CORRECT_TAG]) for output_text in sample["outputs"]
+                ],
+                split=VALID_SPLIT,
+            )
+            instances.append(instance)
+        return instances
+
+
+class RULERHotpotQAScenario(_RULERQAScenario):
+    name = "ruler_hotpotqa"
+    description = "The HotpotQA long-context multi-hop RAG question answering scenario from RULER"
+    tags = ["long_context", "rag"]
+
+    def __init__(self, max_num_words: int):
+        super().__init__("hotpotqa", max_num_words)
+
+
+class RULERSQuADScenario(_RULERQAScenario):
+    name = "ruler_squad"
+    description = "The SQuAD question answering scenario from RULER"
+    tags = ["long_context", "rag"]
+
+    def __init__(self, max_num_words: int):
+        super().__init__("squad", max_num_words)
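For orientation, a minimal usage sketch (not part of the diff) of the new RULER scenario classes; the output path is hypothetical, and `max_num_words` is the long-context word budget passed through to `generate_samples`:

```python
# Sketch only: instantiate one of the new RULER scenarios and materialize its
# 500 validation instances. get_instances downloads the raw HotpotQA JSON into
# <output_path>/data on first use.
from helm.benchmark.scenarios.ruler_qa_scenarios import RULERHotpotQAScenario

scenario = RULERHotpotQAScenario(max_num_words=65536)
instances = scenario.get_instances(output_path="benchmark_output/scenarios/ruler_hotpotqa")
print(len(instances), instances[0].split)  # 500 instances, all in the valid split
```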
helm/benchmark/scenarios/scenario.py (+9 -1)

@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, replace
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Any
 import os
 from pathlib import PurePath
 import inspect
@@ -67,6 +67,11 @@ class Input:
     multimedia_content: Optional[MultimediaObject] = None
     """A single input can consists of multimodal content interleaved (e.g., text, image, text, ...)."""
 
+    messages: Optional[List[Dict[str, str]]] = None
+    """Used for chat models.
+    If messages is specified for a chat model, the prompt is ignored.
+    Otherwise, the client should convert the prompt into a message."""
+
 
 @dataclass(frozen=True)
 class PassageQuestionInput(Input):
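A sketch of how the new field could be populated; the role/content keys follow the common chat-message convention and are an assumption here, not fixed by the dataclass:

```python
# Sketch: a chat-style Input. Per the docstring above, when `messages` is set
# a chat client uses it directly and ignores the prompt; otherwise the client
# wraps the plain-text prompt into a single message itself.
from helm.benchmark.scenarios.scenario import Input

chat_input = Input(
    text="",  # unused when messages is provided to a chat model
    messages=[{"role": "user", "content": "List three uses of a paperclip."}],
)
```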
@@ -153,6 +158,9 @@ class Instance:
     contrast_references: Optional[List[List[Reference]]] = None
     """References for the perturbed input above (if available)"""
 
+    extra_data: Optional[Dict[str, Any]] = None
+    """Extra data required by the scenario e.g. chain-of-thought annotations"""
+
     @property
     def first_correct_reference(self) -> Optional[Reference]:
        """Return the first correct reference."""
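A sketch of a scenario attaching chain-of-thought annotations through the new field; the `chain_of_thought` key is illustrative only, since the shape of `extra_data` is left to each scenario (a plausible consumer is the new multiple_choice_joint_chain_of_thought adapter in this release):

```python
# Sketch: an Instance carrying hypothetical per-example side data for
# downstream adapters via the new extra_data field.
from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Instance,
    Output,
    Reference,
)

instance = Instance(
    input=Input(text="What is 7 * 8?"),
    references=[Reference(Output(text="56"), tags=[CORRECT_TAG])],
    split=TEST_SPLIT,
    extra_data={"chain_of_thought": "7 * 8 = 56."},  # hypothetical key
)
```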
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} (+233 -84)

@@ -19,7 +19,7 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 
-#
+# SEA-HELM Scenarios
 # A. Natural Language Understanding
 # B. Natural Language Generation
 # C. Natural Language Reasoning
@@ -95,7 +95,12 @@ class TyDiQAScenario(Scenario):
         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
 
     def get_instances(self, output_path) -> List[Instance]:
-        dataset = datasets.load_dataset(
+        dataset = datasets.load_dataset(
+            "khalidalt/tydiqa-goldp",
+            "indonesian",
+            revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+            trust_remote_code=True,
+        )
 
         outputs = []
         for split in self.splits.keys():
@@ -171,7 +176,7 @@ class XQuADScenario(Scenario):
         super().__init__()
         self.language = language
         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
-        self.
+        self.language_to_prompt_components = {
             "th": {
                 "passage_prefix": "ข้อความ: ",
                 "question_prefix": "คำถาม: ",
@@ -183,13 +188,19 @@
                 "random_state": 4502,
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
 
     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
         df = dataset.to_pandas()
 
         # Sample 1000 examples for test
-        df_test = df.sample(n=1000, random_state=self.
+        df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])
 
         # In-context examples to be drawn from remaining examples (since there is no train data)
         df_train = df[~df.index.isin(df_test.index)]
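The same refactor recurs throughout this file: a per-language `language_to_prompt_components` dict plus a fail-fast membership check in `__init__`. A standalone sketch of the pattern (toy data, derived from the hunk above):

```python
# Toy version of the validation pattern added across the SEA-HELM scenarios:
# unsupported languages fail at construction time instead of raising a
# KeyError mid-run.
language_to_prompt_components = {
    "th": {"passage_prefix": "ข้อความ: ", "question_prefix": "คำถาม: "},
}

language = "th"  # anything else raises below
if language not in language_to_prompt_components:
    raise Exception(
        f"{language} not supported. Supported languages are {language_to_prompt_components.keys()}."
    )
prompt_components = language_to_prompt_components[language]
print(prompt_components["passage_prefix"])
```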
@@ -210,8 +221,8 @@
             input = PassageQuestionInput(
                 passage=passage,
                 question=question,
-                passage_prefix=str(self.
-                question_prefix=str(self.
+                passage_prefix=str(self.prompt_components["passage_prefix"]),
+                question_prefix=str(self.prompt_components["question_prefix"]),
             )
             references = []
             for answer in row["answers"]["text"]:
@@ -1068,6 +1079,9 @@ class FloresScenario(Scenario):
             "ta": "tam_Taml",
         }
 
+        if self.source not in self.languages.keys() or self.target not in self.languages.keys():
+            raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
+
     def get_instances(self, output_path) -> List[Instance]:
         source_dataset = datasets.load_dataset(
             "facebook/flores",
@@ -1259,6 +1273,9 @@ class XNLIScenario(Scenario):
             "test": TEST_SPLIT,
         }
         self.id2label = {0: "A", 2: "B", 1: "C"}
+        self.supported_languages = ["th", "vi"]
+        if self.language not in self.supported_languages:
+            raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")
 
     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xnli", self.language)
@@ -1449,7 +1466,7 @@ class XCOPAScenario(Scenario):
             0: "A",
             1: "B",
         }
-        self.
+        self.language_to_prompt_components = {
             "id": {
                 "cause": "sebab",
                 "effect": "akibat",
@@ -1476,6 +1493,12 @@
                 "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
 
     def get_instances(self, output_path) -> List[Instance]:
         language_dataset = datasets.load_dataset("xcopa", self.language)
@@ -1489,15 +1512,13 @@
                 language_df, tamil_df[["question", "idx"]], on="idx"
             )  # Use the Tamil split's question column
             for _, row in data.iterrows():
-                instruction1 = self.
-                    self.prompt[self.language][row["question_y"]]
-                )
+                instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
                 passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
                     premise=row["premise"].strip(),
                     instruction1=instruction1,
                     choice1=row["choice1"].strip(),
                     choice2=row["choice2"].strip(),
-                    instruction2=self.
+                    instruction2=self.prompt_components["instruction2"],
                 )
                 input = Input(passage)
                 output = Output(self.id2label[int(row["label"])])
@@ -1549,18 +1570,24 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
 
     name = "lindsea_minimal_pairs"
     description = "LINDSEA minimal pairs task"
-    tags = ["
+    tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]
 
     def __init__(self, method: str, language: str):
         super().__init__()
         self.method = method
         self.language = language
-        self.
+        self.language_to_prompt_components = {
             "id": {
                 "instructions": "Kalimat mana yang lebih mungkin?",
                 "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
             }
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
 
     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1586,6 +1613,7 @@
         outputs = []
         if self.method == "mcq":
             category_list = data["category"].value_counts().keys()
+
             hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
             for category in category_list:
                 # Fix shuffling within each category
@@ -1594,10 +1622,8 @@
                     options = [(row["correct"], 1), (row["wrong"], 2)]
                     random.shuffle(options)
                    options_reversed = True if options[0][1] == 2 else False
-
-
-                    instructions = prompt_components["instructions"]
-                    output_prefix = prompt_components["output_prefix"]
+                    instructions = self.prompt_components["instructions"]
+                    output_prefix = self.prompt_components["output_prefix"]
                     prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
                     input = Input(text=prompt)
                     # Determine correct option based on whether shuffling reversed the options
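The minimal-pairs MCQ path relies on tracking where the correct sentence lands after shuffling; a compact sketch of that bookkeeping (toy sentences, not from the dataset):

```python
# Toy version of the option-shuffling bookkeeping above: each option carries a
# marker (1 = correct, 2 = wrong), so the gold label survives the shuffle.
import random

options = [("The cats sleep.", 1), ("The cats sleeps.", 2)]
random.shuffle(options)
options_reversed = options[0][1] == 2  # True when the wrong sentence is option A
correct_label = "B" if options_reversed else "A"
print(correct_label)
```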
@@ -1625,23 +1651,31 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
|
|
|
1625
1651
|
return outputs
|
|
1626
1652
|
|
|
1627
1653
|
|
|
1628
|
-
# 2. Pragmatics
|
|
1629
|
-
|
|
1630
|
-
class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
|
|
1654
|
+
# 2.1 Pragmatics: LINDSEA Presuppositions
|
|
1655
|
+
class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
|
|
1631
1656
|
"""
|
|
1632
|
-
The LINDSEA
|
|
1657
|
+
The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
|
|
1633
1658
|
The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
|
|
1634
|
-
of quality control.
|
|
1659
|
+
of quality control.
|
|
1635
1660
|
|
|
1636
|
-
The
|
|
1637
|
-
|
|
1661
|
+
The presuppositions dataset involves two formats: single and pair sentences.
|
|
1662
|
+
For single sentence questions, the system under test needs to determine if the sentence is true/false.
|
|
1663
|
+
For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
|
|
1664
|
+
from another sentence.
|
|
1638
1665
|
|
|
1639
|
-
|
|
1666
|
+
For the single format, the models are prompted using the following general format:
|
|
1640
1667
|
|
|
1641
1668
|
Is the following statement true or false?
|
|
1642
1669
|
Statement: <sentence>
|
|
1643
1670
|
Answer only with True or False.
|
|
1644
1671
|
|
|
1672
|
+
For the pair format, the models are prompted using the following general format:
|
|
1673
|
+
|
|
1674
|
+
Situation: <premise>
|
|
1675
|
+
Given this situation, is the following statement true or false?
|
|
1676
|
+
Statement: <hypothesis>
|
|
1677
|
+
Answer only with True or False.
|
|
1678
|
+
|
|
1645
1679
|
Target completion:
|
|
1646
1680
|
<answer>
|
|
1647
1681
|
|
|
@@ -1661,50 +1695,101 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
|
|
|
1661
1695
|
}
|
|
1662
1696
|
"""
|
|
1663
1697
|
|
|
1664
|
-
name = "
|
|
1665
|
-
description = "LINDSEA
|
|
1666
|
-
tags = ["
|
|
1698
|
+
name = "lindsea_pragmatics_presuppositions"
|
|
1699
|
+
description = "LINDSEA presuppositions task"
|
|
1700
|
+
tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]
|
|
1667
1701
|
|
|
1668
|
-
def __init__(self, language: str):
|
|
1702
|
+
def __init__(self, language: str, subset: str):
|
|
1669
1703
|
super().__init__()
|
|
1670
1704
|
self.language = language
|
|
1671
|
-
self.
|
|
1705
|
+
self.subsets = [subset] if subset != "all" else ["single", "pair"]
|
|
1706
|
+
self.language_to_prompt_components = {
|
|
1672
1707
|
"id": {
|
|
1673
|
-
"
|
|
1674
|
-
"
|
|
1708
|
+
"text_noun": "Pernyataan",
|
|
1709
|
+
"premise_noun": "Situasi",
|
|
1710
|
+
"conclusion_noun": "Pernyataan",
|
|
1711
|
+
"single_question": "Apakah pernyataan berikut ini {}?",
|
|
1712
|
+
"single_instruction": "Jawablah dengan {} saja.",
|
|
1713
|
+
"pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
|
|
1714
|
+
"pair_instruction": "Jawablah dengan Benar atau Salah saja.",
|
|
1715
|
+
"True": "Benar",
|
|
1716
|
+
"False": "Salah",
|
|
1675
1717
|
},
|
|
1676
1718
|
}
|
|
1719
|
+
if self.language not in self.language_to_prompt_components.keys():
|
|
1720
|
+
raise Exception(
|
|
1721
|
+
f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
|
|
1722
|
+
)
|
|
1723
|
+
else:
|
|
1724
|
+
self.prompt_components = self.language_to_prompt_components[self.language]
|
|
1677
1725
|
|
|
1678
1726
|
def download_dataset(self, output_path: str):
|
|
1679
1727
|
BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1728
|
+
datasets = []
|
|
1729
|
+
for subset in self.subsets:
|
|
1730
|
+
URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
|
|
1731
|
+
file = f"pragmatic_reasoning_{subset}.jsonl"
|
|
1732
|
+
target_path_file = os.path.join(output_path, file)
|
|
1733
|
+
ensure_file_downloaded(source_url=URL, target_path=target_path_file)
|
|
1734
|
+
data = pd.read_json(target_path_file, lines=True)
|
|
1735
|
+
data["subset"] = subset
|
|
1736
|
+
data = data[data["linguistic_phenomenon"] == "presuppositions"]
|
|
1737
|
+
datasets.append(data)
|
|
1738
|
+
dataset = pd.concat(datasets)
|
|
1685
1739
|
return dataset
|
|
1686
1740
|
|
|
1687
1741
|
def get_instances(self, output_path) -> List[Instance]:
|
|
1688
1742
|
data = self.download_dataset(output_path)
|
|
1689
1743
|
outputs = []
|
|
1690
1744
|
for _, row in data.iterrows():
|
|
1691
|
-
passage =
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1745
|
+
passage = None
|
|
1746
|
+
references = []
|
|
1747
|
+
|
|
1748
|
+
if row["subset"] == "single":
|
|
1749
|
+
question = self.prompt_components["single_question"]
|
|
1750
|
+
text_noun = self.prompt_components["text_noun"]
|
|
1751
|
+
instruction = self.prompt_components["single_instruction"]
|
|
1752
|
+
|
|
1753
|
+
passage = "{question}\{text_noun}: {text}\n{instruction}".format(
|
|
1754
|
+
question=question.format(row["question_translated"]),
|
|
1755
|
+
text_noun=text_noun,
|
|
1756
|
+
text=row["text"],
|
|
1757
|
+
instruction=instruction.format(row["choices_translated"]),
|
|
1758
|
+
)
|
|
1759
|
+
# Split "True or False" into ["True", "or", "False"]
|
|
1760
|
+
choices = row["choices"].split()
|
|
1761
|
+
choices_translated = row["choices_translated"].split()
|
|
1762
|
+
label2choice = {
|
|
1763
|
+
choices[0]: choices_translated[0],
|
|
1764
|
+
choices[2]: choices_translated[2],
|
|
1765
|
+
}
|
|
1766
|
+
references.append(
|
|
1767
|
+
Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
|
|
1768
|
+
)
|
|
1769
|
+
|
|
1770
|
+
elif row["subset"] == "pair":
|
|
1771
|
+
premise_noun = self.prompt_components["premise_noun"]
|
|
1772
|
+
question = self.prompt_components["pair_question"]
|
|
1773
|
+
conclusion_noun = self.prompt_components["conclusion_noun"]
|
|
1774
|
+
instruction = self.prompt_components["pair_instruction"]
|
|
1775
|
+
label = self.prompt_components[str(row["label"])]
|
|
1776
|
+
|
|
1777
|
+
passage = (
|
|
1778
|
+
"{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
|
|
1779
|
+
premise_noun=premise_noun,
|
|
1780
|
+
premise=row["text"],
|
|
1781
|
+
question=question,
|
|
1782
|
+
conclusion_noun=conclusion_noun,
|
|
1783
|
+
conclusion=row["conclusion"],
|
|
1784
|
+
instruction=instruction,
|
|
1785
|
+
)
|
|
1786
|
+
)
|
|
1787
|
+
|
|
1788
|
+
references.append(
|
|
1789
|
+
Reference(Output(text=label), tags=[CORRECT_TAG]),
|
|
1790
|
+
)
|
|
1791
|
+
|
|
1792
|
+
input = Input(text=str(passage))
|
|
1708
1793
|
instance = Instance(
|
|
1709
1794
|
input=input,
|
|
1710
1795
|
references=references,
|
|
@@ -1714,17 +1799,25 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         return outputs


-# 2.2 Pragmatics: LINDSEA
-class
+# 2.2 Pragmatics: LINDSEA Scalar Implicatures
+class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
     """
-    The LINDSEA
+    The LINDSEA Scalar Implicatures Scenario dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control.
+    of quality control.

-    The
+    The scalar implicatures dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
     from another sentence.

-
+    For the single format, the models are prompted using the following general format:
+
+    Is the following statement true or false?
+    Statement: <sentence>
+    Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:

     Situation: <premise>
     Given this situation, is the following statement true or false?
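As an illustration of the pair format sketched in this docstring, the template below renders a hypothetical Indonesian row with the "id" prompt components defined further down in the diff:

    passage = "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
        premise_noun="Situasi",
        premise="Beberapa kue sudah dimakan.",  # hypothetical premise
        question="Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
        conclusion_noun="Pernyataan",
        conclusion="Tidak semua kue dimakan.",  # hypothetical conclusion
        instruction="Jawablah dengan Benar atau Salah saja.",
    )
    print(passage)
    # Situasi: Beberapa kue sudah dimakan.
    # Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?
    # Pernyataan: Tidak semua kue dimakan.
    # Jawablah dengan Benar atau Salah saja.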
@@ -1750,45 +1843,101 @@ class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
     }
     """

-    name = "
-    description = "LINDSEA
-    tags = ["
+    name = "lindsea_pragmatics_scalar_implicatures"
+    description = "LINDSEA scalar implicatures task"
+    tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "
-                "
-
-
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

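A minimal usage sketch for the new constructor, assuming the HELM scenario API shown in this diff and network access to the BHASA repository; the scratch directory is an arbitrary, hypothetical path:

    scenario = LINDSEAPragmaticsScalarImplicaturesScenario(language="id", subset="all")
    # subset="all" expands to both question formats:
    assert scenario.subsets == ["single", "pair"]
    instances = scenario.get_instances(output_path="scratch/lindsea")  # hypothetical path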
     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-
-
-
-
-
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage =
-
-
-
-
-
-
-
-
-
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
@@ -3,7 +3,7 @@ import os
 from typing import List

 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output


 class SelfInstructScenario(Scenario):
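The last hunk swaps a relative import for an absolute one. A relative `from .scenario import ...` resolves only when the module is loaded as part of its package, while the absolute path also works when the file is imported by path or executed directly; a side-by-side sketch:

    # Absolute: resolves the same way regardless of how this module is loaded.
    from helm.benchmark.scenarios.scenario import Scenario

    # Relative: raises ImportError if the module has no known parent package
    # (e.g. when the file is run as a script).
    # from .scenario import Scenario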