crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of crfm-helm has been flagged as potentially problematic.

Files changed (1033)
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,2295 @@
+ import datasets
+ import os
+ import random
+ from typing import List, Dict
+
+ import pandas as pd
+
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Instance,
+     Output,
+     PassageQuestionInput,
+     Reference,
+     Scenario,
+     CORRECT_TAG,
+     TEST_SPLIT,
+     TRAIN_SPLIT,
+     ScenarioMetadata,
+ )
+ from helm.common.general import ensure_file_downloaded
+ from helm.common.hierarchical_logger import hlog
+
+ # SEA-HELM Scenarios
+ # A. Natural Language Understanding
+ # B. Natural Language Generation
+ # C. Natural Language Reasoning
+ # D. Linguistic Diagnostics
+
+ # A. Natural Language Understanding
+ # 1. Question Answering
+ # 2. Sentiment Analysis
+ # 3. Toxicity Detection/Classification
+
+
+ # 1. Question Answering
+ # 1.1 Indonesian: TyDiQA
+ class TyDiQAScenario(Scenario):
+     """
+     TyDiQA is an open-book question answering scenario for 11 typologically-diverse languages.
+     The questions are written by people who want to know the answer, but do not know the answer yet,
+     and the data is collected directly in each language without the use of translation.
+
+     This scenario only uses the Indonesian subset of the data, and uses the Gold Passage (GoldP) task,
+     which requires the tested system to extract a span from the given passage to answer a given question.
+     There are no unanswerable questions.
+
+     The models are prompted using the following format:
+
+         Anda akan diberikan sebuah paragraf dan sebuah pertanyaan. Jawablah pertanyaannya dengan mengekstrak jawaban
+         dari paragraf tersebut.
+
+         Paragraf: <text>
+         Pertanyaan: <question>
+         Jawaban: <answer>
+
+         ...
+
+         Paragraf: <text>
+         Pertanyaan: <question>
+         Jawaban:
+
+
+     Target completion:
+         <answer>
+
+     @article{clark-etal-2020-tydi,
+         title = "{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically
+             Diverse Languages",
+         author = "Clark, Jonathan H. and
+             Choi, Eunsol and
+             Collins, Michael and
+             Garrette, Dan and
+             Kwiatkowski, Tom and
+             Nikolaev, Vitaly and
+             Palomaki, Jennimaria",
+         editor = "Johnson, Mark and
+             Roark, Brian and
+             Nenkova, Ani",
+         journal = "Transactions of the Association for Computational Linguistics",
+         volume = "8",
+         year = "2020",
+         address = "Cambridge, MA",
+         publisher = "MIT Press",
+         url = "https://aclanthology.org/2020.tacl-1.30",
+         doi = "10.1162/tacl_a_00317",
+         pages = "454--470",
+     }
+     """
+
+     name = "tydiqa"
+     description = "Indonesian Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "khalidalt/tydiqa-goldp",
+             "indonesian",
+             revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+             trust_remote_code=True,
+         )
+
+         outputs = []
+         for split in self.splits.keys():
+             df = dataset[split].to_pandas()
+
+             if split == "train":
+                 # Select only bottom 20th percentile by length for in-context examples as examples are very long
+                 data = df[df["passage_text"].apply(len) < df["passage_text"].apply(len).quantile(0.2)]
+             else:
+                 data = df
+
+             for _, row in data.iterrows():
+                 passage = row["passage_text"].strip()
+                 question = row["question_text"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix="Paragraf: ",
+                     question_prefix="Pertanyaan: ",
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="tydiqa",
+             display_name="TyDiQA",
+             short_display_name=None,
+             description="TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an "
+             "open-book question answering dataset for 11 typologically-diverse languages. "
+             "The questions are written by people who want to know the answer, but do not "
+             "know the answer yet, and the data is collected directly in each language "
+             "without the use of translation.\n",
+             taxonomy=TaxonomyInfo(
+                 task="question answering",
+                 what="questions by human annotators about Wikipedia articles",
+                 when="?",
+                 who="human annotators",
+                 language="Indonesian",
+             ),
+             main_metric="squad_f1_score",
+             main_split="test",
+         )
+
+
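For readers skimming the diff, here is a minimal illustrative sketch of exercising the scenario class above; it is not part of the added file, it assumes only the Scenario API shown in this diff, and the scratch directory name is arbitrary.

# Illustrative only: load TyDiQA instances and count them per split.
scenario = TyDiQAScenario()
instances = scenario.get_instances(output_path="./tydiqa_scratch")
num_train = sum(1 for i in instances if i.split == TRAIN_SPLIT)
num_test = sum(1 for i in instances if i.split == TEST_SPLIT)
hlog(f"TyDiQA: {num_train} in-context candidates, {num_test} test instances")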
+ # 1.2 Vietnamese & Thai: XQuAD
+ class XQuADScenario(Scenario):
+     """
+     XQuAD is an open-book question answering scenario that is parallel across 10 languages.
+     The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the
+     development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
+
+     This scenario only uses the Vietnamese and Thai subsets of the data and there are no
+     unanswerable questions.
+
+     The models are prompted using the following general format:
+
+         You will be given a paragraph and a question. Answer the question by extracting the answer from the paragraph.
+
+         Paragraph: <text>
+         Question: <question>
+         Answer: <answer>
+
+         ...
+
+         Paragraph: <text>
+         Question: <question>
+         Answer:
+
+     Target completion:
+         <answer>
+
+     @article{Artetxe:etal:2019,
+         author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
+         title = {On the cross-lingual transferability of monolingual representations},
+         journal = {CoRR},
+         volume = {abs/1910.11856},
+         year = {2019},
+         archivePrefix = {arXiv},
+         eprint = {1910.11856}
+     }
+     """
+
+     name = "xquad"
+     description = "Vietnamese and Thai Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.language_to_prompt_components = {
+             "th": {
+                 "passage_prefix": "ข้อความ: ",
+                 "question_prefix": "คำถาม: ",
+                 "random_state": 4520,
+             },
+             "vi": {
+                 "passage_prefix": "Đoạn văn: ",
+                 "question_prefix": "Câu hỏi: ",
+                 "random_state": 4502,
+             },
+         }
+         if self.language not in self.language_to_prompt_components.keys():
+             raise Exception(
+                 f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+             )
+         else:
+             self.prompt_components = self.language_to_prompt_components[self.language]
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
+         df = dataset.to_pandas()
+
+         # Sample 1000 examples for test
+         df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])
+
+         # In-context examples to be drawn from remaining examples (since there is no train data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         # Select only bottom 20th percentile by length for in-context examples as examples are very long
+         df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 passage = row["context"].strip()
+                 question = row["question"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix=str(self.prompt_components["passage_prefix"]),
+                     question_prefix=str(self.prompt_components["question_prefix"]),
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name=f"xquad_{self.language}",
+             display_name=f"XQuAD ({self.language})",
+             short_display_name=None,
+             description="XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book "
+             "question answering dataset that is parallel across 10 languages. The dataset "
+             "consists of a subset of 240 paragraphs and 1190 question-answer pairs from the "
+             "development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their "
+             "professional translations.\n",
+             taxonomy=TaxonomyInfo(
+                 task="question answering",
+                 what="questions by crowdworkers about Wikipedia articles translated "
+                 f"from English to {self.language}",
+                 when="?",
+                 who="?",
+                 language=self.language,
+             ),
+             main_metric="squad_f1_score",
+             main_split="test",
+         )
+
+
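A quick illustrative note on the class above: the constructor is keyed by language code, and anything outside the supported set raises. A brief sketch (illustrative only, not part of the diff):

# Supported codes are "th" and "vi"; any other code raises the Exception defined in __init__.
xquad_vi = XQuADScenario(language="vi")
xquad_th = XQuADScenario(language="th")
try:
    XQuADScenario(language="id")
except Exception as e:
    hlog(str(e))  # the message lists the supported language codes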
+ # 1.3 Tamil: IndicQA
+ class IndicQAScenario(Scenario):
+     """
+     IndicQA is an open-book question answering scenario for 11 Indic languages.
+     Answers to questions are to be extracted from the text provided. The data is taken from
+     Wikipedia articles across various domains and questions and answers were manually created
+     by native speakers.
+
+     This scenario only uses the Tamil subset of the data and unanswerable questions
+     are removed from the dataset in order to be consistent with the question answering
+     scenarios for Indonesian, Vietnamese and Thai.
+
+     The models are prompted using the following format:
+
+         உங்களுக்கு ஒரு பத்தியும் ஒரு கேள்வியும் தரப்படும். தரப்பட்ட பத்தியிலிருந்து கேள்விக்கான பதிலைக் கண்டறியவும்.
+
+         பத்தி: <text>
+         கேள்வி: <question>
+         பதில்: <answer>
+
+         ...
+
+         பத்தி: <text>
+         கேள்வி: <question>
+         பதில்:
+
+     Target completion:
+         <answer>
+
+     @inproceedings{doddapaneni-etal-2023-towards,
+         title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+             {I}ndic Languages",
+         author = "Doddapaneni, Sumanth and
+             Aralikatte, Rahul and
+             Ramesh, Gowtham and
+             Goyal, Shreya and
+             Khapra, Mitesh M. and
+             Kunchukuttan, Anoop and
+             Kumar, Pratyush",
+         editor = "Rogers, Anna and
+             Boyd-Graber, Jordan and
+             Okazaki, Naoaki",
+         booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+             Long Papers)",
+         month = jul,
+         year = "2023",
+         address = "Toronto, Canada",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.acl-long.693",
+         doi = "10.18653/v1/2023.acl-long.693",
+         pages = "12402--12426",
+     }
+     """
+
+     name = "indicqa"
+     description = "Tamil Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "ai4bharat/IndicQA",
+             "indicqa.ta",
+             split="test",
+             revision="78ee8d58e880c72f324e176c989dfefa55427af4",
+             trust_remote_code=True,
+         )
+         df = dataset.to_pandas()
+
+         # Remove unanswerable questions (answer is an empty string)
+         df = df[df["answers"].apply(lambda x: len(x["text"][0].strip()) > 0)]
+
+         # Sample 1000 examples for test
+         df_test = df.sample(n=1000, random_state=7900)
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         # Select only bottom 20th percentile by length for in-context examples as examples are very long
+         df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 passage = row["context"].strip()
+                 question = row["question"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix="பத்தி: ",
+                     question_prefix="கேள்வி: ",
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="indicqa",
+             display_name="IndicQA",
+             short_display_name=None,
+             description="IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693) is an "
+             "open-book question answering dataset for 11 Indic languages. Answers to "
+             "questions are to be extracted from the text provided. The data is taken from "
+             "Wikipedia articles across various domains and questions and answers were "
+             "manually created by native speakers.\n",
+             taxonomy=TaxonomyInfo(
+                 task="question answering",
+                 what="questions about Wikipedia articles translated by native speakers from English to Tamil",
+                 when="?",
+                 who="?",
+                 language="Tamil",
+             ),
+             main_metric="squad_f1_score",
+             main_split="test",
+         )
+
+
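To make the two filtering steps above concrete, here is a small standalone pandas sketch of the same logic on toy rows (illustrative only; the column names mirror the HuggingFace schema used above):

import pandas as pd

toy = pd.DataFrame(
    {
        "context": ["short passage", "a much longer passage " * 20, "a medium-length passage here"],
        "answers": [{"text": ["answer"]}, {"text": [""]}, {"text": ["another answer"]}],
    }
)
# Step 1: drop unanswerable rows (the first answer is an empty string).
toy = toy[toy["answers"].apply(lambda x: len(x["text"][0].strip()) > 0)]
# Step 2: keep only the shortest contexts (bottom 20th percentile by length) as in-context examples.
short_only = toy[toy["context"].apply(len) < toy["context"].apply(len).quantile(0.2)]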
+ # 2. Sentiment Analysis
+ # 2.1 Indonesian: NusaX Sentiment
+ class NusaXScenario(Scenario):
+     """
+     NusaX is a sentiment analysis scenario for 11 Indonesian languages.
+     The data is derived from a subset of SmSA (Purwarianti and Crisdayanti, 2019) and manually translated
+     from Indonesian to 10 other local languages, such as Acehnese and Toba Batak.
+     It consists of comments and reviews from various online platforms.
+
+     Only the Indonesian subset of the data is used for this scenario, and the labels are
+     positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         Apa sentimen dari kalimat berikut ini?
+         Jawablah dengan satu kata saja:
+         - Positif
+         - Negatif
+         - Netral
+
+         Kalimat: <text>
+         Jawaban: <sentiment>
+
+         ...
+
+         Kalimat: <text>
+         Jawaban:
+
+     Target completion:
+         <sentiment>
+
+     @inproceedings{winata-etal-2023-nusax,
+         title = "{N}usa{X}: Multilingual Parallel Sentiment Dataset for 10 {I}ndonesian Local Languages",
+         author = "Winata, Genta Indra and
+             Aji, Alham Fikri and
+             Cahyawijaya, Samuel and
+             Mahendra, Rahmad and
+             Koto, Fajri and
+             Romadhony, Ade and
+             Kurniawan, Kemal and
+             Moeljadi, David and
+             Prasojo, Radityo Eko and
+             Fung, Pascale and
+             Baldwin, Timothy and
+             Lau, Jey Han and
+             Sennrich, Rico and
+             Ruder, Sebastian",
+         editor = "Vlachos, Andreas and
+             Augenstein, Isabelle",
+         booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for
+             Computational Linguistics",
+         month = may,
+         year = "2023",
+         address = "Dubrovnik, Croatia",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.eacl-main.57",
+         doi = "10.18653/v1/2023.eacl-main.57",
+         pages = "815--834",
+     }
+     """
+
+     name = "nusax"
+     description = "Indonesian NusaX-Senti Sentiment Analysis dataset"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "positive": "Positif",
+             "negative": "Negatif",
+             "neutral": "Netral",
+         }
+
+     def download_dataset(self, output_path: str):
+         URLS = {
+             "test": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/test.csv",
+             "train": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/train.csv",
+         }
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(output_path, split)
+             ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
+             data = pd.read_csv(target_path_file)
+             dataset[split] = data
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["text"].strip())
+                 output = Output(text=self.sentiment2label[row["label"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="nusax",
+             display_name="NusaX",
+             short_display_name=None,
+             description="NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an "
+             "Indonesian sentiment analysis dataset. The data consists of comments and "
+             "reviews from various online platforms.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis",
+                 what="online comments and reviews",
+                 when="?",
+                 who="internet users",
+                 language="Indonesian",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 2.2 Vietnamese: UIT-VSFC
+ class UITVSFCScenario(Scenario):
+     """
+     UIT-VSFC is a Vietnamese sentiment analysis scenario. The data consists of student feedback obtained from
+     end-of-semester surveys at a Vietnamese university. Feedback is labeled as one of three sentiment
+     polarities: positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         Sắc thái của câu sau đây là gì?
+         Trả lời với một từ duy nhất:
+         - Tích cực
+         - Tiêu cực
+         - Trung lập
+
+         Câu văn: <text>
+         Câu trả lời: <sentiment>
+
+         ...
+
+         Câu văn: <text>
+         Câu trả lời:
+
+     Target completion:
+         <sentiment>
+
+     @inproceedings{van2018uit,
+         title={UIT-VSFC: Vietnamese students’ feedback corpus for sentiment analysis},
+         author={Van Nguyen, Kiet and Nguyen, Vu Duc and Nguyen, Phu XV and Truong, Tham TH and Nguyen, Ngan Luu-Thuy},
+         booktitle={2018 10th international conference on knowledge and systems engineering (KSE)},
+         pages={19--24},
+         year={2018},
+         organization={IEEE},
+         url={https://ieeexplore.ieee.org/document/8573337},
+     }
+     """
+
+     name = "uitvsfc"
+     description = "Vietnamese Students' Feedback Corpus sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "Tiêu cực",
+             1: "Trung lập",
+             2: "Tích cực",
+         }
+
+     def download_dataset(self, output_path: str):
+         URLS = {
+             "train": {
+                 "sentences": "https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download",
+                 "sentiments": "https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download",
+             },
+             "test": {
+                 "sentences": "https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download",
+                 "sentiments": "https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download",
+             },
+         }
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in list(URLS.keys()):
+             file_lines: Dict[str, List[str]] = {}
+             for file in list(URLS[split].keys()):
+                 file_lines[file] = []
+                 target_path_file = os.path.join(output_path, split, file)
+                 ensure_file_downloaded(source_url=URLS[split][file], target_path=target_path_file)
+                 with open(target_path_file, "r") as f:
+                     lines = f.readlines()
+                     for line in lines:
+                         file_lines[file].append(str(line).strip())
+             df = pd.DataFrame({"text": file_lines["sentences"], "label": file_lines["sentiments"]})
+             if split == "test":
+                 dataset[split] = df.groupby("label", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                 )
+             else:
+                 dataset[split] = df
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["text"])
+                 output = Output(text=self.id2label[int(row["label"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="uitvsfc",
+             display_name="UIT-VSFC",
+             short_display_name=None,
+             description="UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a "
+             "Vietnamese sentiment analysis dataset. The data consists of student feedback "
+             "obtained from end-of-semester surveys at a Vietnamese university.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis",
+                 what="university student end-of-semester survey responses",
+                 when="?",
+                 who="university students",
+                 language="Vietnamese",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
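The test split above is built with a label-stratified sample of roughly 1,000 rows, a pattern that recurs in several scenarios below. Here is an isolated sketch of that pattern (illustrative only; toy data and an arbitrary seed):

import pandas as pd

df = pd.DataFrame({"text": [f"t{i}" for i in range(3000)], "label": [i % 3 for i in range(3000)]})
# Sampling frac=1000/len(df) within each label group preserves the label distribution
# while drawing about 1,000 rows overall.
sample = df.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=1000 / len(df), random_state=0))
assert abs(len(sample) - 1000) < 5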
+ # 2.3 Thai: Wisesight Sentiment
+ class WisesightScenario(Scenario):
+     """
+     Wisesight Sentiment is a Thai sentiment analysis scenario. The data consists of social media messages
+     regarding consumer products and services.
+
+     The dataset originally included the label "question" for instances that were questions. These instances
+     made up only a small subset of the data and were dropped in order to make the task more consistent
+     with those of other languages. Labels are therefore only positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         อารมณ์ความรู้สึกของข้อความต่อไปนี้เป็นอย่างไร?
+         กรุณาตอบโดยใช้คำเดียวเท่านั้น:
+         - แง่บวก
+         - แง่ลบ
+         - เฉยๆ
+
+         ข้อความ: <text>
+         คำตอบ: <sentiment>
+
+         ...
+
+         ข้อความ: <text>
+         คำตอบ:
+
+     Target completion:
+         <sentiment>
+
+     @software{bact_2019_3457447,
+         author = {Suriyawongkul, Arthit and
+             Chuangsuwanich, Ekapol and
+             Chormai, Pattarawat and
+             Polpanumas, Charin},
+         title = {PyThaiNLP/wisesight-sentiment: First release},
+         month = sep,
+         year = 2019,
+         publisher = {Zenodo},
+         version = {v1.0},
+         doi = {10.5281/zenodo.3457447},
+         url = {https://doi.org/10.5281/zenodo.3457447}
+     }
+     """
+
+     name = "wisesight"
+     description = "Wisesight Sentiment Thai sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "pos": "แง่บวก",
+             "neg": "แง่ลบ",
+             "neu": "เฉยๆ",
+         }
+
+     def download_dataset(self, output_path: str):
+         URL = "https://github.com/PyThaiNLP/wisesight-sentiment/raw/master/huggingface/data.zip"
+         data_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(data_path, "data", f"{split}.jsonl")
+             df = pd.read_json(target_path_file, lines=True)
+             df = df[df["category"] != "q"]  # Drop instances with the "question" label
+             if split == "test":
+                 dataset[split] = df.groupby("category", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4183)
+                 )
+             else:
+                 dataset[split] = df
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["texts"].strip())
+                 output = Output(text=self.sentiment2label[row["category"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="wisesight",
+             display_name="Wisesight",
+             short_display_name=None,
+             description="Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is "
+             "a Thai sentiment analysis scenario. The data consists of social media "
+             "messages regarding consumer products and services.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis",
+                 what="social media messages regarding consumer products and services",
+                 when="?",
+                 who="social media users",
+                 language="Thai",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 2.4 Tamil: IndicSentiment
+ class IndicSentimentScenario(Scenario):
+     """
+     IndicSentiment is a sentiment analysis scenario for 10 Indic languages. The data consists of
+     product reviews written in English that were then translated by native speakers of the
+     respective languages, resulting in a parallel dataset across the 10 languages.
+
+     Only the Tamil subset of the dataset is used for this scenario. Labels are positive or negative.
+
+     The models are prompted using the following format:
+
+         பின்வரும் வாக்கியத்தில் வெளிப்படுத்தப்படும் உணர்வு எது?
+         ஒரு சொல்லில் மட்டும் பதிலளிக்கவும்:
+         - நேர்மறை
+         - எதிர்மறை
+
+         வாக்கியம்: <text>
+         பதில்: <answer>
+
+         ...
+
+         வாக்கியம்: <text>
+         பதில்:
+
+     Target completion:
+         <sentiment> (positive or negative)
+
+     @inproceedings{doddapaneni-etal-2023-towards,
+         title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+             {I}ndic Languages",
+         author = "Doddapaneni, Sumanth and
+             Aralikatte, Rahul and
+             Ramesh, Gowtham and
+             Goyal, Shreya and
+             Khapra, Mitesh M. and
+             Kunchukuttan, Anoop and
+             Kumar, Pratyush",
+         editor = "Rogers, Anna and
+             Boyd-Graber, Jordan and
+             Okazaki, Naoaki",
+         booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+             Long Papers)",
+         month = jul,
+         year = "2023",
+         address = "Toronto, Canada",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.acl-long.693",
+         doi = "10.18653/v1/2023.acl-long.693",
+         pages = "12402--12426",
+     }
+     """
+
+     name = "indicsentiment"
+     description = "IndicSentiment Tamil sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "Positive": "நேர்மறை",
+             "Negative": "எதிர்மறை",
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "ai4bharat/IndicSentiment",
+             "translation-ta",
+             revision="dc8f3f66886531c6897fedffca1e938a68fc5013",
+             trust_remote_code=True,
+         )
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split].to_pandas()
+             # Current version on HuggingFace datasets has 2 instances without labels across all languages.
+             # Confirmed with first author that the labels for these instances should be Positive.
+             data["LABEL"] = data["LABEL"].fillna("Positive")
+             for _, row in data.iterrows():
+                 input = Input(row["INDIC REVIEW"].strip())
+                 output = Output(text=self.sentiment2label[row["LABEL"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="indicsentiment",
+             display_name="IndicSentiment",
+             short_display_name=None,
+             description="IndicSentiment is a Tamil sentiment analysis dataset that comes from "
+             "IndicXTREME [(Doddapaneni, "
+             "2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product "
+             "reviews that were written by annotators. Labels are positive or negative.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis", what="product reviews", when="?", who="human annotators", language="Tamil"
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 3. Toxicity Detection/Classification
+ # 3.1 Indonesian: Multi-Label Hate Speech Detection
+ class MLHSDScenario(Scenario):
+     """
+     Multi-Label Hate Speech and Abusive Language Detection (MLHSD) is an Indonesian toxicity
+     classification scenario. The data is obtained from Twitter and PII has been anonymized to
+     USER and URL.
+
+     The original dataset was used for a multi-label classification task, but it has been repurposed
+     as a multi-class classification task to be more aligned with the task for other languages.
+     The mapping is done as follows:
+     - Clean: No abusive language or hate speech labels
+     - Abusive: Only abusive language label but no hate speech labels
+     - Hate: As long as one hate speech label is present
+
+     The models are prompted using the following format:
+
+         Anda adalah pendeteksi ujaran kebencian. Definisi dari labelnya adalah sebagai berikut:
+         Bersih: Tidak ada ujaran kebencian.
+         Kasar: Ada ujaran kebencian dan kata-kata kasar, namun tidak menyerang pihak tertentu.
+         Benci: Ada ujaran kebencian atau serangan langsung terhadap pihak tertentu.
+         Berdasarkan definisi labelnya, klasifikasikan kalimat berikut ini dengan satu kata saja:
+         - Bersih
+         - Kasar
+         - Benci
+
+         Kalimat: <text>
+         Jawaban: <answer>
+
+         ...
+
+         Kalimat: <text>
+         Jawaban:
+
+     Target completion:
+         <answer>
+
+     @inproceedings{ibrohim-budi-2019-multi,
+         title = "Multi-label Hate Speech and Abusive Language Detection in {I}ndonesian {T}witter",
+         author = "Ibrohim, Muhammad Okky and
+             Budi, Indra",
+         editor = "Roberts, Sarah T. and
+             Tetreault, Joel and
+             Prabhakaran, Vinodkumar and
+             Waseem, Zeerak",
+         booktitle = "Proceedings of the Third Workshop on Abusive Language Online",
+         month = aug,
+         year = "2019",
+         address = "Florence, Italy",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/W19-3506",
+         doi = "10.18653/v1/W19-3506",
+         pages = "46--57",
+     }
+     """
+
+     name = "mlhsd"
+     description = (
+         "Multi-Label Hate Speech and Abusive Language Detection (MLHSD) Indonesian toxicity classification task"
+     )
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def download_dataset(self, output_path: str):
+         BASE_URL = "https://raw.githubusercontent.com/okkyibrohim/"
+         URL = f"{BASE_URL}id-multi-label-hate-speech-and-abusive-language-detection/master/re_dataset.csv"
+         target_path_file = os.path.join(output_path, "mlhsd")
+         ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+         df = pd.read_csv(target_path_file, encoding="ISO-8859-1")
+
+         # Map multi-label task to multi-class task
+         df["label"] = df.apply(lambda x: self.get_label(x), axis=1)
+
+         df_test = df.groupby("label", group_keys=False).apply(
+             lambda x: x.sample(frac=1000 / len(df), random_state=7123)
+         )
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+         return dataset
+
+     def get_label(self, row) -> str:
+         if int(row["HS"]) == 1:
+             return "Benci"
+         elif int(row["Abusive"]) == 1:
+             return "Kasar"
+         else:
+             return "Bersih"
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["Tweet"].strip())
+                 output = Output(text=row["label"])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="mlhsd",
+             display_name="MLHSD",
+             short_display_name=None,
+             description="MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian "
+             "toxicity detection dataset obtained from tweets on Twitter.\n",
+             taxonomy=TaxonomyInfo(
+                 task="toxicity detection/classification",
+                 what="tweets",
+                 when="?",
+                 who="Twitter users",
+                 language="Indonesian",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
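The multi-label-to-multi-class mapping described in the docstring boils down to the precedence hate > abusive > clean; a small sketch of the same rule outside the class (illustrative only, toy rows):

import pandas as pd

rows = pd.DataFrame({"HS": [1, 0, 0], "Abusive": [1, 1, 0]})

def to_class(row) -> str:
    # Same precedence as MLHSDScenario.get_label: any hate-speech label wins,
    # then abusive-only, otherwise clean.
    if int(row["HS"]) == 1:
        return "Benci"
    elif int(row["Abusive"]) == 1:
        return "Kasar"
    return "Bersih"

rows["label"] = rows.apply(to_class, axis=1)  # ["Benci", "Kasar", "Bersih"]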
+ # 3.2 Vietnamese: ViHSD
+ class ViHSDScenario(Scenario):
+     """
+     ViHSD is a Vietnamese toxicity classification scenario. The data is obtained from social media.
+     The labels are Clean, Offensive and Hate.
+
+     The models are prompted using the following format:
+
+         Bạn là máy phát hiện phát ngôn thù ghét. Các nhãn được định nghĩa như sau:
+         Sạch: Không quấy rối.
+         Công kích: Bao gồm quấy rối và thậm chí chửi thề, nhưng không tấn công bất kì đối tượng cụ thể nào.
+         Thù ghét: Trực tiếp quấy rối hay lăng mạ một đối tượng cụ thể.
+         Với các định nghĩa của nhãn, hãy phân loại câu dưới đây với một từ duy nhất:
+         - Sạch
+         - Công kích
+         - Thù ghét
+
+
+         Câu văn: <text>
+         Câu trả lời: <toxicity>
+
+         ...
+
+         Câu văn: <text>
+         Câu trả lời:
+
+     Target completion:
+         <toxicity>
+
+     @InProceedings{10.1007/978-3-030-79457-6_35,
+         author="Luu, Son T.
+             and Nguyen, Kiet Van
+             and Nguyen, Ngan Luu-Thuy",
+         editor="Fujita, Hamido
+             and Selamat, Ali
+             and Lin, Jerry Chun-Wei
+             and Ali, Moonis",
+         title="A Large-Scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts",
+         booktitle="Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices",
+         year="2021",
+         publisher="Springer International Publishing",
+         address="Cham",
+         pages="415--426",
+         isbn="978-3-030-79457-6",
+         url="https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35",
+     }
+     """
+
+     name = "vihsd"
+     description = "ViHSD Vietnamese toxicity classification task"
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "Sạch",
+             1: "Công kích",
+             2: "Thù ghét",
+         }
+
+     def download_dataset(self, output_path: str):
+         URL = "https://raw.githubusercontent.com/sonlam1102/vihsd/main/data/vihsd.zip"
+         data_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(data_path, "vihsd", f"{split}.csv")
+             df = pd.read_csv(target_path_file)
+             data = df.groupby("label_id", group_keys=False).apply(
+                 lambda x: x.sample(frac=1000 / len(df), random_state=4878)
+             )
+             dataset[split] = data
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(str(row["free_text"]).strip())
+                 output = Output(text=self.id2label[int(row["label_id"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="vihsd",
+             display_name="ViHSD",
+             short_display_name=None,
+             description="ViHSD [(Luu, "
+             "2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35) is a "
+             "Vietnamese toxicity detection dataset obtained from comments on Facebook, "
+             "Youtube, Instagram, and Tiktok.\n",
+             taxonomy=TaxonomyInfo(
+                 task="toxicity detection/classification",
+                 what="social media comments",
+                 when="?",
+                 who="social media users",
+                 language="Vietnamese",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 3.3 Thai: Thai Toxicity Tweets
+ class ThaiToxicityTweetsScenario(Scenario):
+     """
+     Thai Toxicity Tweets is a Thai toxicity detection scenario. The data is obtained from Twitter.
+     Instances with no labels or with "TWEET_NOT_FOUND" as the text were dropped from the dataset.
+     The labels are either Y (the text is toxic) or N (the text is clean).
+
+     The models are prompted using the following format:
+
+         คุณเป็นเครื่องตรวจจับข้อความที่เป็นพิษ
+         ข้อความเป็นพิษถ้ามันทำร้าย สร้างความเสียหาย หรือสื่อเจตนาเชิงลบถึงบุคคลหรือกลุ่มบุคคล
+         และคุณควรคำนึงถึงการประชดประชันด้วย
+         เมื่อได้รับข้อความ ให้ตอบด้วยตัวอักษร Y หากเป็นพิษ และ N หากไม่เป็นพิษ
+
+         ข้อความ: <text>
+         คำตอบ: <toxicity>
+
+         ...
+
+         ข้อความ: <text>
+         คำตอบ:
+
+     Target completion:
+         <toxicity>
+
+     @inproceedings{sirihattasak2018annotation,
+         title={Annotation and classification of toxicity for Thai Twitter},
+         author={Sirihattasak, Sugan and Komachi, Mamoru and Ishikawa, Hiroshi},
+         booktitle={TA-COS 2018: 2nd Workshop on Text Analytics for Cybersecurity and Online Safety},
+         pages={1},
+         year={2018},
+         url={http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf},
+     }
+     """
+
+     name = "thaitoxicitytweets"
+     description = "Thai Toxicity Tweets toxicity detection task"
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "N",
+             1: "Y",
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "tmu-nlp/thai_toxicity_tweet",
+             split="train",
+             revision="aa021e41d0ee6dbee2975fbed620ec8c586bdaf6",
+             trust_remote_code=True,
+         )
+         df = dataset.to_pandas()
+
+         # Drop instances where there are no labels or text is "TWEET_NOT_FOUND"
+         df = df[df["tweet_text"].str.len() > 0]
+         df = df[df["tweet_text"] != "TWEET_NOT_FOUND"]
+
+         df_test = df.groupby("is_toxic", group_keys=False).apply(
+             lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+         )
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["tweet_text"].strip())
+                 output = Output(text=self.id2label[int(row["is_toxic"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="thaitoxicitytweets",
+             display_name="Thai Toxicity Tweets",
+             short_display_name=None,
+             description="Thai Toxicity Tweets [(Sirihattasak, "
+             "2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a "
+             "Thai toxicity detection dataset obtained from tweets on Twitter.\n",
+             taxonomy=TaxonomyInfo(
+                 task="toxicity detection/classification", what="tweets", when="", who="Twitter users", language="Thai"
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
1209
+ # B. Natural Language Generation
1210
+ # 1. Machine Translation
1211
+
1212
+
1213
+ # 1. Machine Translation: FLoRes-200
1214
+ class FloresScenario(Scenario):
1215
+ """
1216
+ FLoRes-200 is a machine translation scenario for 200+ languages. The data is obtained from English Wikimedia
1217
+ projects (Wikivoyage, Wikijunior and Wikinews), and professionally translated across 200+ languages to obtain a
1218
+ parallel dataset.
1219
+
1220
+ Only the English, Indonesian, Vietnamese, Thai and Tamil subsets are used in this scenario. Both directions
1221
+ (in and out of English) for each Southeast Asian language are included in the scenario.
1222
+
1223
+ The models are prompted using the following general format:
1224
+
1225
+ Translate the following text into <language> language.
1226
+
1227
+ Text: <text>
1228
+ Translation: <translation>
1229
+
1230
+ ...
1231
+
1232
+ Text: <text>
1233
+ Translation:
1234
+
1235
+ Target completion:
1236
+ <translation>
1237
+
1238
+ @article{nllb2022,
1239
+ author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield,
1240
+ Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang,
1241
+ Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti,
1242
+ John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran,
1243
+ Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao,
1244
+ Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers,
1245
+ Safiyyah Saleem, Holger Schwenk, Jeff Wang
1246
+ },
1247
+ title = {No Language Left Behind: Scaling Human-Centered Machine Translation},
1248
+ year = {2022},
1249
+ url = {https://research.facebook.com/publications/no-language-left-behind/},
1250
+ }
1251
+
1252
+ """
1253
+
1254
+ name = "flores"
1255
+ description = "FLoRes-200 machine translation task"
1256
+ tags = ["machine_translation"]
1257
+
1258
+ def __init__(self, pair: str):
1259
+ super().__init__()
1260
+ self.pair = pair
1261
+ self.source = pair.split("_")[0]
1262
+ self.target = pair.split("_")[1]
1263
+
1264
+ self.splits = {"dev": TRAIN_SPLIT, "devtest": TEST_SPLIT}
1265
+
1266
+ self.languages = {
1267
+ "en": "eng_Latn",
1268
+ "id": "ind_Latn",
1269
+ "vi": "vie_Latn",
1270
+ "th": "tha_Thai",
1271
+ "ta": "tam_Taml",
1272
+ }
1273
+
1274
+ if self.source not in self.languages.keys() or self.target not in self.languages.keys():
1275
+ raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
1276
+
1277
+ def get_instances(self, output_path) -> List[Instance]:
1278
+ source_dataset = datasets.load_dataset(
1279
+ "facebook/flores",
1280
+ self.languages[self.source],
1281
+ revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
1282
+ trust_remote_code=True,
1283
+ )
1284
+ target_dataset = datasets.load_dataset(
1285
+ "facebook/flores",
1286
+ self.languages[self.target],
1287
+ revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
1288
+ trust_remote_code=True,
1289
+ )
1290
+
1291
+ outputs = []
1292
+ for split in self.splits.keys():
1293
+ source_df = source_dataset[split].to_pandas()
1294
+ target_df = target_dataset[split].to_pandas()
1295
+ data = source_df.join(target_df, lsuffix="_source", rsuffix="_target")
1296
+ for _, row in data.iterrows():
1297
+ input = Input(row["sentence_source"].strip())
1298
+ output = Output(row["sentence_target"].strip())
1299
+ references = [
1300
+ Reference(output, tags=[CORRECT_TAG]),
1301
+ ]
1302
+ instance = Instance(input=input, references=references, split=self.splits[split])
1303
+ outputs.append(instance)
1304
+ return outputs
1305
+
1306
+ def get_metadata(self) -> ScenarioMetadata:
1307
+ return ScenarioMetadata(
1308
+ name=f"flores_{self.source}_{self.target}",
1309
+ display_name=f"Flores ({self.source} to {self.target})",
1310
+ short_display_name=None,
1311
+ description="Flores [(NLLB Team, "
1312
+ "2022)](https://research.facebook.com/publications/no-language-left-behind/) "
1313
+ "was created with professional human translators who translate the FLORES "
1314
+ "source dataset into the target languages and a separate group of independent "
1315
+ "translation reviewers who perform quality assessments of the human "
1316
+ "translations and provide translation feedback to the translators.\n",
1317
+ taxonomy=TaxonomyInfo(
1318
+ task="machine translation",
1319
+ what="translations from professional human translators",
1320
+ when="?",
1321
+ who="professional human translators",
1322
+ language=f"{self.source}, {self.target}",
1323
+ ),
1324
+ main_metric="chr_f_plus_plus",
1325
+ main_split="test",
1326
+ )
1327
+
1328
+
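The FloresScenario constructor treats `pair` as "<source>_<target>", with both codes drawn from its five supported languages. A standalone sketch of the parsing and lookup it relies on (the "en_id" pair is only an example):

    pair = "en_id"  # illustrative: English -> Indonesian
    source, target = pair.split("_")
    languages = {"en": "eng_Latn", "id": "ind_Latn", "vi": "vie_Latn", "th": "tha_Thai", "ta": "tam_Taml"}
    assert source in languages and target in languages
    # get_instances() then loads facebook/flores for languages[source] and languages[target]
    # and joins the two splits row by row into parallel source/target sentence instances.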
1329
+ # C. Natural Language Reasoning
1330
+ # 1. Natural Language Inference
1331
+ # 2. Causal Reasoning
1332
+
1333
+
1334
+ # 1. Natural Language Inference
1335
+ # 1.1 Indonesian: IndoNLI
1336
+ class IndoNLIScenario(Scenario):
1337
+ """
1338
+ IndoNLI is an Indonesian Natural Language Inference (NLI) scenario. The data is sourced from Wikipedia, news,
1339
+ and web articles. Native speakers use premise text from these sources and write hypothesis sentences for each
1340
+ NLI label. The labels are entailment, contradiction, or neutral.
1341
+
1342
+ The models are prompted using the following format:
1343
+
1344
+ Anda akan diberikan dua kalimat, X dan Y.
1345
+ Tentukan mana dari pernyataan berikut ini yang paling sesuai untuk kalimat X dan Y.
1346
+ A: Kalau X benar, maka Y juga harus benar.
1347
+ B: X bertentangan dengan Y.
1348
+ C: Ketika X benar, Y mungkin benar atau mungkin tidak benar.
1349
+ Jawablah dengan satu huruf saja, A, B atau C.
1350
+
1351
+ X: <sentence1>
1352
+ Y: <sentence2>
1353
+ Jawaban: <entailment>
1354
+
1355
+ ...
1356
+
1357
+ X: <sentence1>
1358
+ Y: <sentence2>
1359
+ Jawaban:
1360
+
1361
+ Target completion:
1362
+ <entailment>
1363
+
1364
+ @inproceedings{mahendra-etal-2021-indonli,
1365
+ title = "{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian",
1366
+ author = "Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara",
1367
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
1368
+ month = nov,
1369
+ year = "2021",
1370
+ address = "Online and Punta Cana, Dominican Republic",
1371
+ publisher = "Association for Computational Linguistics",
1372
+ url = "https://aclanthology.org/2021.emnlp-main.821",
1373
+ pages = "10511--10527",
1374
+ }
1375
+ """
1376
+
1377
+ name = "indonli"
1378
+ description = "IndoNLI Indonesian Natural Language Inference task"
1379
+ tags = ["natural_language_inference"]
1380
+
1381
+ def __init__(self):
1382
+ super().__init__()
1383
+ self.splits = {
1384
+ "train": TRAIN_SPLIT,
1385
+ "test": TEST_SPLIT,
1386
+ }
1387
+ self.id2label = {"e": "A", "c": "B", "n": "C"}
1388
+
1389
+ def download_dataset(self, output_path: str):
1390
+ URLS = {
1391
+ "train": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/train.jsonl",
1392
+ "test": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/test_lay.jsonl",
1393
+ }
1394
+
1395
+ dataset: Dict[str, pd.DataFrame] = {}
1396
+ for split in self.splits.keys():
1397
+ target_path_file = os.path.join(output_path, split)
1398
+ ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
1399
+ df = pd.read_json(target_path_file, lines=True)
1400
+ if split == "test":
1401
+ dataset[split] = df.groupby("label", group_keys=False).apply(
1402
+ lambda x: x.sample(frac=1000 / len(df), random_state=4685)
1403
+ )
1404
+ else:
1405
+ dataset[split] = df
1406
+ return dataset
1407
+
1408
+ def get_instances(self, output_path) -> List[Instance]:
1409
+ dataset = self.download_dataset(output_path)
1410
+ outputs = []
1411
+ for split in self.splits.keys():
1412
+ data = dataset[split]
1413
+ for _, row in data.iterrows():
1414
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1415
+ input = Input(passage)
1416
+ output = Output(self.id2label[row["label"]])
1417
+ references = [
1418
+ Reference(output, tags=[CORRECT_TAG]),
1419
+ ]
1420
+ instance = Instance(input=input, references=references, split=self.splits[split])
1421
+ outputs.append(instance)
1422
+ return outputs
1423
+
1424
+ def get_metadata(self) -> ScenarioMetadata:
1425
+ return ScenarioMetadata(
1426
+ name="indonli",
1427
+ display_name="IndoNLI",
1428
+ short_display_name=None,
1429
+ description="IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a "
1430
+ "natural language inference dataset obtained from Wikipedia, news, and web "
1431
+ "articles that incorporates various linguistic phenomena such as numerical "
1432
+ "reasoning, structural changes, idioms, or temporal and spatial reasoning. \n",
1433
+ taxonomy=TaxonomyInfo(
1434
+ task="natural language inference",
1435
+ what="Wikipedia, news, and web articles",
1436
+ when="?",
1437
+ who="?",
1438
+ language="Indonesian",
1439
+ ),
1440
+ main_metric="exact_match",
1441
+ main_split="test",
1442
+ )
1443
+
1444
+
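A standalone sketch of how IndoNLIScenario assembles one instance from a raw row: the premise and hypothesis are packed into the X/Y passage, and the e/c/n label is mapped onto the A/B/C options from the prompt (the example sentences are invented; English glosses in the comments):

    id2label = {"e": "A", "c": "B", "n": "C"}        # entailment / contradiction / neutral
    premise = "Ibu memasak nasi goreng."             # "Mother is cooking fried rice."
    hypothesis = "Ada makanan yang sedang dimasak."  # "Some food is being cooked."
    passage = "X: " + premise.strip() + "\nY: " + hypothesis.strip()
    reference = id2label["e"]                        # entailment -> "A"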
1445
+ # 1.2 Vietnamese & Thai: XNLI
1446
+ class XNLIScenario(Scenario):
1447
+ """
1448
+ XNLI is a Natural Language Inference scenario for 15 languages. The data was constructed following the
1449
+ MultiNLI crowdsourcing procedure to obtain English data, which was then professionally translated across
1450
+ 14 other languages. Labels are entailment, neutral, or contradiction.
1451
+
1452
+ The models are prompted using the following general format:
1453
+
1454
+ You will be given two sentences, X and Y.
1455
+ Determine which of the following statements applies to sentences X and Y the best.
1456
+ A: If X is true, Y must be true.
1457
+ B: X contradicts Y.
1458
+ C: When X is true, Y may or may not be true.
1459
+ Answer strictly with a single letter A, B or C.
1460
+
1461
+ X: <sentence1>
1462
+ Y: <sentence2>
1463
+ Answer: <entailment>
1464
+
1465
+ ...
1466
+
1467
+ X: <sentence1>
1468
+ Y: <sentence2>
1469
+ Answer:
1470
+
1471
+ Target completion:
1472
+ <entailment>
1473
+
1474
+ @inproceedings{conneau-etal-2018-xnli,
1475
+ title = "{XNLI}: Evaluating Cross-lingual Sentence Representations",
1476
+ author = "Conneau, Alexis and
1477
+ Rinott, Ruty and
1478
+ Lample, Guillaume and
1479
+ Williams, Adina and
1480
+ Bowman, Samuel and
1481
+ Schwenk, Holger and
1482
+ Stoyanov, Veselin",
1483
+ editor = "Riloff, Ellen and
1484
+ Chiang, David and
1485
+ Hockenmaier, Julia and
1486
+ Tsujii, Jun{'}ichi",
1487
+ booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
1488
+ month = oct # "-" # nov,
1489
+ year = "2018",
1490
+ address = "Brussels, Belgium",
1491
+ publisher = "Association for Computational Linguistics",
1492
+ url = "https://aclanthology.org/D18-1269",
1493
+ doi = "10.18653/v1/D18-1269",
1494
+ pages = "2475--2485",
1495
+ }
1496
+ """
1497
+
1498
+ name = "xnli"
1499
+ description = "XNLI Natural Language Inference task"
1500
+ tags = ["natural_language_inference"]
1501
+
1502
+ def __init__(self, language: str):
1503
+ super().__init__()
1504
+ self.language = language
1505
+ self.splits = {
1506
+ "validation": TRAIN_SPLIT,
1507
+ "test": TEST_SPLIT,
1508
+ }
1509
+ self.id2label = {0: "A", 2: "B", 1: "C"}
1510
+ self.supported_languages = ["th", "vi"]
1511
+ if self.language not in self.supported_languages:
1512
+ raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")
1513
+
1514
+ def get_instances(self, output_path) -> List[Instance]:
1515
+ dataset = datasets.load_dataset("xnli", self.language)
1516
+ outputs = []
1517
+ for split in self.splits.keys():
1518
+ df = dataset[split].to_pandas()
1519
+ if split == "validation":
1520
+ data = df
1521
+ else:
1522
+ # This produces 999 instances
1523
+ data = df.groupby("label", group_keys=False).apply(
1524
+ lambda x: x.sample(frac=1000 / len(df), random_state=4156)
1525
+ )
1526
+
1527
+ # Add 1 neutral instance from remaining instances to the test data to make 1000 in total
1528
+ remainder = df[~df.index.isin(data.index)]
1529
+ neutral_instance = remainder[remainder["label"] == 1].iloc[0].to_frame().transpose()
1530
+ data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
1531
+ for _, row in data.iterrows():
1532
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1533
+ input = Input(passage)
1534
+ output = Output(self.id2label[int(row["label"])])
1535
+ references = [
1536
+ Reference(output, tags=[CORRECT_TAG]),
1537
+ ]
1538
+ instance = Instance(input=input, references=references, split=self.splits[split])
1539
+ outputs.append(instance)
1540
+ return outputs
1541
+
1542
+ def get_metadata(self) -> ScenarioMetadata:
1543
+ return ScenarioMetadata(
1544
+ name=f"xnli_{self.language}",
1545
+ display_name=f"XNLI ({self.language})",
1546
+ short_display_name=None,
1547
+ description="XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural "
1548
+ "language inference dataset obtained from crowdsourced NLI data then "
1549
+ "professionally translated across 14 other languages.\n",
1550
+ taxonomy=TaxonomyInfo(
1551
+ task="natural language inference",
1552
+ what="crowdsourced NLI data professionally translated",
1553
+ when="?",
1554
+ who="?",
1555
+ language=self.language,
1556
+ ),
1557
+ main_metric="exact_match",
1558
+ main_split="test",
1559
+ )
1560
+
1561
+
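The per-label `sample(frac=1000 / len(df))` call above rounds each group to a whole number of rows, which is why it yields 999 rather than 1000 instances and why one extra neutral row is appended afterwards. A standalone sketch with a toy frame shaped like the XNLI test split (5,010 rows, balanced over three labels):

    import pandas as pd

    df = pd.DataFrame({"label": [0, 1, 2] * 1670})   # 5,010 rows, 1,670 per label
    sample = df.groupby("label", group_keys=False).apply(
        lambda g: g.sample(frac=1000 / len(df), random_state=4156)
    )
    # round(1670 * 1000 / 5010) == 333 rows per label, so 3 * 333 == 999 in total.
    assert len(sample) == 999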
1562
+ # 1.3 Tamil: IndicXNLI
1563
+ class IndicXNLIScenario(Scenario):
1564
+ """
1565
+ IndicXNLI is a Natural Language Inference scenario for 11 Indic languages. The data was
1566
+ automatically translated from the English XNLI dataset into 11 Indic languages using
1567
+ IndicTrans (Ramesh et al., 2021).
1568
+
1569
+ Only the Tamil subset of the data is used in this scenario. The labels are
1570
+ entailment, contradiction and neutral.
1571
+
1572
+ The models are prompted using the following format:
1573
+
1574
+ உங்களுக்கு இரண்டு வாக்கியங்கள், X மற்றும் Y, தரப்படும்.
1575
+ பின்வரும் கூற்றுகளில் எது X மற்றும் Y வாக்கியங்களுடன் மிகப் பொருந்துகிறது எனக் கண்டறியவும்.
1576
+ A: X உண்மை என்றால் Y உம் உண்மையாக இருக்க வேண்டும்.
1577
+ B: X உம் Y உம் முரண்படுகின்றன.
1578
+ C: X உண்மையாக இருக்கும்போது Y உண்மையாக இருக்கலாம் அல்லது இல்லாமல் இருக்கலாம்.
1579
+ A அல்லது B அல்லது C என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.
1580
+
1581
+ X: <premise>
1582
+ Y: <hypothesis>
1583
+ பதில்: <entailment>
1584
+
1585
+ ...
1586
+
1587
+ X: <premise>
1588
+ Y: <hypothesis>
1589
+ பதில்:
1590
+
1591
+ Target completion:
1592
+ <entailment>
1593
+
1594
+ @inproceedings{aggarwal-etal-2022-indicxnli,
1595
+ title = "{I}ndic{XNLI}: Evaluating Multilingual Inference for {I}ndian Languages",
1596
+ author = "Aggarwal, Divyanshu and
1597
+ Gupta, Vivek and
1598
+ Kunchukuttan, Anoop",
1599
+ editor = "Goldberg, Yoav and
1600
+ Kozareva, Zornitsa and
1601
+ Zhang, Yue",
1602
+ booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
1603
+ month = dec,
1604
+ year = "2022",
1605
+ address = "Abu Dhabi, United Arab Emirates",
1606
+ publisher = "Association for Computational Linguistics",
1607
+ url = "https://aclanthology.org/2022.emnlp-main.755",
1608
+ doi = "10.18653/v1/2022.emnlp-main.755",
1609
+ pages = "10994--11006",
1610
+ }
1611
+ """
1612
+
1613
+ name = "indicxnli"
1614
+ description = "IndicXNLI Natural Language Inference task"
1615
+ tags = ["natural_language_inference"]
1616
+
1617
+ def __init__(self):
1618
+ super().__init__()
1619
+ self.splits = {
1620
+ "validation": TRAIN_SPLIT,
1621
+ "test": TEST_SPLIT,
1622
+ }
1623
+ self.id2label = {0: "A", 2: "B", 1: "C"}
1624
+
1625
+ def get_instances(self, output_path) -> List[Instance]:
1626
+ dataset = datasets.load_dataset("Divyanshu/indicxnli", "ta")
1627
+
1628
+ outputs = []
1629
+ for split in self.splits.keys():
1630
+ df = dataset[split].to_pandas()
1631
+ if split == "validation":
1632
+ data = df
1633
+ else:
1634
+ # This produces 999 instances
1635
+ data = df.groupby("label", group_keys=False).apply(
1636
+ lambda x: x.sample(frac=1000 / len(df), random_state=4156)
1637
+ )
1638
+
1639
+ # Add 1 neutral instance from remaining instances to the test data to make 1000 in total
1640
+ remainder = df[~df.index.isin(data.index)]
1641
+ neutral_instance = remainder[remainder["label"] == 2].iloc[0].to_frame().transpose()
1642
+ data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
1643
+ for _, row in data.iterrows():
1644
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1645
+ input = Input(passage)
1646
+ output = Output(text=self.id2label[row["label"]])
1647
+ references = [
1648
+ Reference(output, tags=[CORRECT_TAG]),
1649
+ ]
1650
+ instance = Instance(input=input, references=references, split=self.splits[split])
1651
+ outputs.append(instance)
1652
+ return outputs
1653
+
1654
+ def get_metadata(self) -> ScenarioMetadata:
1655
+ return ScenarioMetadata(
1656
+ name="indicxnli",
1657
+ display_name="IndicXNLI",
1658
+ short_display_name=None,
1659
+ description="IndicXNLI is a Tamil sentiment analysis dataset that comes from IndicXTREME "
1660
+ "[(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), which "
1661
+ "automatically translated from XNLI into 11 Indic languages.\n",
1662
+ taxonomy=TaxonomyInfo(
1663
+ task="natural language inference",
1664
+ what="crowdsourced NLI data professionally translated into Tamil",
1665
+ when="?",
1666
+ who="?",
1667
+ language="Tamil",
1668
+ ),
1669
+ main_metric="exact_match",
1670
+ main_split="test",
1671
+ )
1672
+
1673
+
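In the prompt, options A, B and C stand for entailment, contradiction and neutral respectively, so the id2label mapping above pairs label id 0 with A, 2 with B and 1 with C; this matches the usual XNLI id order (0 = entailment, 1 = neutral, 2 = contradiction), which IndicXNLI is assumed to share. A one-line check of the mapping:

    id2label = {0: "A", 2: "B", 1: "C"}
    # 0 (entailment) -> A, 1 (neutral) -> C, 2 (contradiction) -> B
    assert [id2label[i] for i in (0, 1, 2)] == ["A", "C", "B"]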
1674
+ # 2. Causal Reasoning: XCOPA
1675
+ class XCOPAScenario(Scenario):
1676
+ """
1677
+ XCOPA is a commonsense causal reasoning scenario for 11 languages. The data is sourced from the English
1678
+ COPA dataset and professionally translated across 11 languages to create a parallel dataset.
1679
+
1680
+ Only the Indonesian, Vietnamese, Thai and Tamil subsets were used for this scenario. Each instance consists of
1681
+ a premise and two sentences. The system under test needs to determine which of the two sentences is more likely
1682
+ to be the cause/effect of the premise. Whether the cause or the effect is asked for differs from instance to
1683
+ instance. Although there should be an equal number of instances asking for the cause and for the effect, it was
1684
+ found in the BHASA paper (Leong et al., 2023) that this was not the case for Indonesian and Thai. The
1685
+ cause/effect labels are therefore corrected in this scenario by harmonizing them across the four languages, using the
1686
+ Tamil subset as the reference.
1687
+
1688
+ The models are prompted using the following general format:
1689
+
1690
+ Based on the following situation, which of the following choices is most likely to be its {cause/effect}?
1691
+ Answer only with a single letter A or B.
1692
+
1693
+ Situation: <premise>
1694
+ A: <choice1>
1695
+ B: <choice2>
1696
+ Answer: <answer>
1697
+
1698
+ ...
1699
+
1700
+ Situation: <premise>
1701
+ A: <choice1>
1702
+ B: <choice2>
1703
+ Answer:
1704
+
1705
+ Target completion:
1706
+ <answer>
1707
+
1708
+ @article{ponti2020xcopa,
1709
+ title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
1710
+ author={Edoardo M. Ponti, Goran Glava{\v{s}}, Olga Majewska,
1711
+ Qianchu Liu, Ivan Vuli{\'c} and Anna Korhonen},
1712
+ journal={arXiv preprint},
1713
+ year={2020},
1714
+ url={https://ducdauge.github.io/files/xcopa.pdf}
1715
+ }
1716
+
1717
+ @inproceedings{roemmele2011choice,
1718
+ title={Choice of plausible alternatives: An evaluation of commonsense causal reasoning},
1719
+ author={Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},
1720
+ booktitle={2011 AAAI Spring Symposium Series},
1721
+ year={2011},
1722
+ url={https://people.ict.usc.edu/~gordon/publications/AAAI-SPRING11A.PDF},
1723
+ }
1724
+ """
1725
+
1726
+ name = "xcopa"
1727
+ description = "XCOPA causal reasoning task"
1728
+ tags = ["causal_reasoning"]
1729
+
1730
+ def __init__(self, language: str):
1731
+ super().__init__()
1732
+ self.language = language
1733
+ self.splits = {
1734
+ "validation": TRAIN_SPLIT,
1735
+ "test": TEST_SPLIT,
1736
+ }
1737
+ self.id2label = {
1738
+ 0: "A",
1739
+ 1: "B",
1740
+ }
1741
+ self.language_to_prompt_components = {
1742
+ "id": {
1743
+ "cause": "sebab",
1744
+ "effect": "akibat",
1745
+ "instruction1": "Berdasarkan situasi di atas, mana dari pilihan-pilihan berikut ini yang lebih "
1746
+ "mungkin menjadi {}?",
1747
+ "instruction2": "Jawablah dengan satu huruf saja, A atau B.",
1748
+ },
1749
+ "ta": {
1750
+ "cause": "காரணமாக",
1751
+ "effect": "விளைவாக",
1752
+ "instruction1": "பின்வரும் வாக்கியங்களில் பெரும்பாலும் எது தரப்பட்ட சூழ்நிலைக்குரிய {} இருக்கும்?",
1753
+ "instruction2": "A அல்லது B என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.",
1754
+ },
1755
+ "th": {
1756
+ "cause": "สาเหตุ",
1757
+ "effect": "ผล",
1758
+ "instruction1": "เมื่อพิจารณาจากสถานการณ์นี้ ตัวเลือกใดต่อไปนี้น่าจะเป็น{}มากกว่ากัน?",
1759
+ "instruction2": "กรุณาตอบด้วยตัวอักษร A หรือ B ตัวเดียวเท่านั้น",
1760
+ },
1761
+ "vi": {
1762
+ "cause": "nguyên nhân",
1763
+ "effect": "kết quả",
1764
+ "instruction1": "Với tình huống trên, lựa chọn nào dưới đây có khả năng cao là {} của nó hơn?",
1765
+ "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
1766
+ },
1767
+ }
1768
+ if self.language not in self.language_to_prompt_components.keys():
1769
+ raise Exception(
1770
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
1771
+ )
1772
+ else:
1773
+ self.prompt_components = self.language_to_prompt_components[self.language]
1774
+
1775
+ def get_instances(self, output_path) -> List[Instance]:
1776
+ language_dataset = datasets.load_dataset("xcopa", self.language)
1777
+ tamil_dataset = datasets.load_dataset("xcopa", "ta")
1778
+
1779
+ outputs = []
1780
+ for split in self.splits.keys():
1781
+ language_df = language_dataset[split].to_pandas()
1782
+ tamil_df = tamil_dataset[split].to_pandas()
1783
+ data = pd.merge(
1784
+ language_df, tamil_df[["question", "idx"]], on="idx"
1785
+ ) # Use the Tamil split's question column
1786
+ for _, row in data.iterrows():
1787
+ instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
1788
+ passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
1789
+ premise=row["premise"].strip(),
1790
+ instruction1=instruction1,
1791
+ choice1=row["choice1"].strip(),
1792
+ choice2=row["choice2"].strip(),
1793
+ instruction2=self.prompt_components["instruction2"],
1794
+ )
1795
+ input = Input(passage)
1796
+ output = Output(self.id2label[int(row["label"])])
1797
+ references = [
1798
+ Reference(output, tags=[CORRECT_TAG]),
1799
+ ]
1800
+ instance = Instance(input=input, references=references, split=self.splits[split])
1801
+ outputs.append(instance)
1802
+ return outputs
1803
+
1804
+ def get_metadata(self) -> ScenarioMetadata:
1805
+ return ScenarioMetadata(
1806
+ name=f"xcopa_{self.language}",
1807
+ display_name=f"XCOPA ({self.language})",
1808
+ short_display_name=None,
1809
+ description="XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is causal "
1810
+ "reasoning dataset, a translation and reannotation of the English COPA. English "
1811
+ "COPA included questions that directly assess commonsense causal reasoning.\n",
1812
+ taxonomy=TaxonomyInfo(
1813
+ task="causal reasoning",
1814
+ what="commonsense causal reasoning questions translated into " "Indonesian",
1815
+ when="?",
1816
+ who="?",
1817
+ language=self.language,
1818
+ ),
1819
+ main_metric="exact_match",
1820
+ main_split="test",
1821
+ )
1822
+
1823
+
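A standalone sketch of the label-harmonisation merge in XCOPAScenario.get_instances: both frames carry a `question` column, so pandas suffixes them `_x`/`_y` after the merge, and `question_y` (the Tamil reference) decides whether the prompt asks for the cause or the effect (the toy rows are invented):

    import pandas as pd

    language_df = pd.DataFrame({"idx": [0, 1], "question": ["cause", "effect"], "premise": ["p0", "p1"]})
    tamil_df = pd.DataFrame({"idx": [0, 1], "question": ["effect", "effect"]})
    data = pd.merge(language_df, tamil_df[["question", "idx"]], on="idx")
    # Overlapping columns get the default suffixes: question_x (local), question_y (Tamil reference).
    assert list(data["question_y"]) == ["effect", "effect"]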
1824
+ # 1. Syntax: LINDSEA Minimal Pairs
1825
+ class LINDSEASyntaxMinimalPairsScenario(Scenario):
1826
+ """
1827
+ The LINDSEA Minimal Pairs dataset is a linguistic diagnostic scenario targeting syntactic phenomena.
1828
+ The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
1829
+ of quality control. The high-level categories tested for include morphology, argument structure,
1830
+ filler-gap dependencies, as well as negative polarity items and negation.
1831
+
1832
+ The test is designed as a minimal pair, with a pair of sentences that differ minimally from each other
1833
+ and which exemplify a specific syntactic phenomenon. The system under test needs to determine which
1834
+ sentence of the pair is more acceptable.
1835
+
1836
+ The models are prompted using the following general format:
1837
+
1838
+ Which sentence is more acceptable?
1839
+ Answer only with a single letter A or B.
1840
+ <sentence>
1841
+
1842
+ Target completion:
1843
+ <sentence>
1844
+
1845
+ @misc{leong2023bhasa,
1846
+ title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
1847
+ author={Wei Qi Leong
1848
+ and Jian Gang Ngui
1849
+ and Yosephine Susanto
1850
+ and Hamsawardhini Rengarajan
1851
+ and Kengatharaiyer Sarveswaran
1852
+ and William Chandra Tjhi
1853
+ },
1854
+ year={2023},
1855
+ eprint={2309.06085},
1856
+ archivePrefix={arXiv},
1857
+ primaryClass={cs.CL},
1858
+ url={https://arxiv.org/abs/2309.06085},
1859
+ }
1860
+ """
1861
+
1862
+ name = "lindsea_minimal_pairs"
1863
+ description = "LINDSEA minimal pairs task"
1864
+ tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]
1865
+
1866
+ def __init__(self, method: str, language: str):
1867
+ super().__init__()
1868
+ self.method = method
1869
+ self.language = language
1870
+ self.language_to_prompt_components = {
1871
+ "id": {
1872
+ "instructions": "Kalimat mana yang lebih mungkin?",
1873
+ "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
1874
+ }
1875
+ }
1876
+ if self.language not in self.language_to_prompt_components.keys():
1877
+ raise Exception(
1878
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
1879
+ )
1880
+ else:
1881
+ self.prompt_components = self.language_to_prompt_components[self.language]
1882
+
1883
+ def download_dataset(self, output_path: str):
1884
+ BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
1885
+ URLS = {
1886
+ "npis_and_negation": f"{BASE_URL}{self.language}/syntax/NPIs_and_negation.jsonl",
1887
+ "argument_structure": f"{BASE_URL}{self.language}/syntax/argument_structure.jsonl",
1888
+ "filler_gap_dependencies": f"{BASE_URL}{self.language}/syntax/filler-gap_dependencies.jsonl",
1889
+ "morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
1890
+ }
1891
+
1892
+ data_files = {}
1893
+ for file in list(URLS.keys()):
1894
+ target_path_file = os.path.join(output_path, file)
1895
+ ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
1896
+ data_files[file] = pd.read_json(target_path_file, lines=True)
1897
+ dataset = pd.concat(data_files)
1898
+
1899
+ return dataset
1900
+
1901
+ def get_instances(self, output_path: str) -> List[Instance]:
1902
+ data = self.download_dataset(output_path)
1903
+
1904
+ outputs = []
1905
+ if self.method == "mcq":
1906
+ category_list = data["category"].value_counts().keys()
1907
+
1908
+ hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
1909
+ for category in category_list:
1910
+ # Fix shuffling within each category
1911
+ random.seed(1)
1912
+ for _, row in data[data["category"] == category].iterrows():
1913
+ options = [(row["correct"], 1), (row["wrong"], 2)]
1914
+ random.shuffle(options)
1915
+ options_reversed = options[0][1] == 2
1916
+ instructions = self.prompt_components["instructions"]
1917
+ output_prefix = self.prompt_components["output_prefix"]
1918
+ prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
1919
+ input = Input(text=prompt)
1920
+ # Determine correct option based on whether shuffling reversed the options
1921
+ references = [
1922
+ Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
1923
+ Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
1924
+ ]
1925
+ instance = Instance(input=input, references=references, split=TEST_SPLIT)
1926
+ outputs.append(instance)
1927
+
1928
+ else:
1929
+ for _, row in data.iterrows():
1930
+ # No need to shuffle since we are comparing logprobs of the options separately
1931
+ input = Input(text="")
1932
+ references = [
1933
+ Reference(Output(text=row["correct"].strip()), tags=[CORRECT_TAG]),
1934
+ Reference(Output(text=row["wrong"].strip()), tags=[]),
1935
+ ]
1936
+ instance = Instance(
1937
+ input=input,
1938
+ references=references,
1939
+ split=TEST_SPLIT,
1940
+ )
1941
+ outputs.append(instance)
1942
+ return outputs
1943
+
1944
+ def get_metadata(self) -> ScenarioMetadata:
1945
+ return ScenarioMetadata(
1946
+ name=f"lindsea_syntax_minimal_pairs_{self.language}",
1947
+ display_name="LINDSEA Syntax Minimal Pairs",
1948
+ short_display_name=None,
1949
+ description="LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA "
1950
+ "[(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of "
1951
+ "sentences that differ minimally from each other and contrast in grammatical "
1952
+ "acceptability.\n",
1953
+ taxonomy=TaxonomyInfo(
1954
+ task="minimal pairs",
1955
+ what="sentence pairs with minimal differences and constrasting " "grammatical acceptability",
1956
+ when="?",
1957
+ who="?",
1958
+ language=self.language,
1959
+ ),
1960
+ main_metric="exact_match",
1961
+ main_split="test",
1962
+ )
1963
+
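A standalone sketch of the option-shuffling bookkeeping in the `mcq` branch above: each option is paired with a marker (1 = correct, 2 = wrong), so after shuffling, the marker sitting in position A tells us whether A or B should carry CORRECT_TAG (the sentence pair is invented):

    import random

    random.seed(1)
    correct, wrong = "Dia membaca buku.", "Dia buku membaca."  # invented minimal pair
    options = [(correct, 1), (wrong, 2)]
    random.shuffle(options)
    options_reversed = options[0][1] == 2      # True when the wrong sentence landed in position A
    answer = "B" if options_reversed else "A"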
1964
+
1965
+ # 2.1 Pragmatics: LINDSEA Presuppositions
1966
+ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
1967
+ """
1968
+ The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
1969
+ The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
1970
+ of quality control.
1971
+
1972
+ The presuppositions dataset involves two formats: single and pair sentences.
1973
+ For single sentence questions, the system under test needs to determine if the sentence is true/false.
1974
+ For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
1975
+ from another sentence.
1976
+
1977
+ For the single format, the models are prompted using the following general format:
1978
+
1979
+ Is the following statement true or false?
1980
+ Statement: <sentence>
1981
+ Answer only with True or False.
1982
+
1983
+ For the pair format, the models are prompted using the following general format:
1984
+
1985
+ Situation: <premise>
1986
+ Given this situation, is the following statement true or false?
1987
+ Statement: <hypothesis>
1988
+ Answer only with True or False.
1989
+
1990
+ Target completion:
1991
+ <answer>
1992
+
1993
+ @misc{leong2023bhasa,
1994
+ title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
1995
+ author={Wei Qi Leong
1996
+ and Jian Gang Ngui
1997
+ and Yosephine Susanto
1998
+ and Hamsawardhini Rengarajan
1999
+ and Kengatharaiyer Sarveswaran
2000
+ and William Chandra Tjhi
2001
+ },
2002
+ year={2023},
2003
+ eprint={2309.06085},
2004
+ archivePrefix={arXiv},
2005
+ primaryClass={cs.CL}
2006
+ }
2007
+ """
2008
+
2009
+ name = "lindsea_pragmatics_presuppositions"
2010
+ description = "LINDSEA presuppositions task"
2011
+ tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]
2012
+
2013
+ def __init__(self, language: str, subset: str):
2014
+ super().__init__()
2015
+ self.language = language
2016
+ self.subsets = [subset] if subset != "all" else ["single", "pair"]
2017
+ self.language_to_prompt_components = {
2018
+ "id": {
2019
+ "text_noun": "Pernyataan",
2020
+ "premise_noun": "Situasi",
2021
+ "conclusion_noun": "Pernyataan",
2022
+ "single_question": "Apakah pernyataan berikut ini {}?",
2023
+ "single_instruction": "Jawablah dengan {} saja.",
2024
+ "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
2025
+ "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
2026
+ "True": "Benar",
2027
+ "False": "Salah",
2028
+ },
2029
+ }
2030
+ if self.language not in self.language_to_prompt_components.keys():
2031
+ raise Exception(
2032
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
2033
+ )
2034
+ else:
2035
+ self.prompt_components = self.language_to_prompt_components[self.language]
2036
+
2037
+ def download_dataset(self, output_path: str):
2038
+ BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
2039
+ datasets = []
2040
+ for subset in self.subsets:
2041
+ URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
2042
+ file = f"pragmatic_reasoning_{subset}.jsonl"
2043
+ target_path_file = os.path.join(output_path, file)
2044
+ ensure_file_downloaded(source_url=URL, target_path=target_path_file)
2045
+ data = pd.read_json(target_path_file, lines=True)
2046
+ data["subset"] = subset
2047
+ data = data[data["linguistic_phenomenon"] == "presuppositions"]
2048
+ datasets.append(data)
2049
+ dataset = pd.concat(datasets)
2050
+ return dataset
2051
+
2052
+ def get_instances(self, output_path) -> List[Instance]:
2053
+ data = self.download_dataset(output_path)
2054
+ outputs = []
2055
+ for _, row in data.iterrows():
2056
+ passage = None
2057
+ references = []
2058
+
2059
+ if row["subset"] == "single":
2060
+ question = self.prompt_components["single_question"]
2061
+ text_noun = self.prompt_components["text_noun"]
2062
+ instruction = self.prompt_components["single_instruction"]
2063
+
2064
+ passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
2065
+ question=question.format(row["question_translated"]),
2066
+ text_noun=text_noun,
2067
+ text=row["text"],
2068
+ instruction=instruction.format(row["choices_translated"]),
2069
+ )
2070
+ # Split "True or False" into ["True", "or", "False"]
2071
+ choices = row["choices"].split()
2072
+ choices_translated = row["choices_translated"].split()
2073
+ label2choice = {
2074
+ choices[0]: choices_translated[0],
2075
+ choices[2]: choices_translated[2],
2076
+ }
2077
+ references.append(
2078
+ Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
2079
+ )
2080
+
2081
+ elif row["subset"] == "pair":
2082
+ premise_noun = self.prompt_components["premise_noun"]
2083
+ question = self.prompt_components["pair_question"]
2084
+ conclusion_noun = self.prompt_components["conclusion_noun"]
2085
+ instruction = self.prompt_components["pair_instruction"]
2086
+ label = self.prompt_components[str(row["label"])]
2087
+
2088
+ passage = (
2089
+ "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
2090
+ premise_noun=premise_noun,
2091
+ premise=row["text"],
2092
+ question=question,
2093
+ conclusion_noun=conclusion_noun,
2094
+ conclusion=row["conclusion"],
2095
+ instruction=instruction,
2096
+ )
2097
+ )
2098
+
2099
+ references.append(
2100
+ Reference(Output(text=label), tags=[CORRECT_TAG]),
2101
+ )
2102
+
2103
+ input = Input(text=str(passage))
2104
+ instance = Instance(
2105
+ input=input,
2106
+ references=references,
2107
+ split=TEST_SPLIT,
2108
+ )
2109
+ outputs.append(instance)
2110
+ return outputs
2111
+
2112
+ def get_metadata(self) -> ScenarioMetadata:
2113
+ return ScenarioMetadata(
2114
+ name=f"lindsea_pragmatics_presuppositions_{self.language}",
2115
+ display_name="LINDSEA Pragmatics Presuppositions",
2116
+ short_display_name=None,
2117
+ description="LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics "
2118
+ "dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), "
2119
+ "involving two formats: single and pair sentences. For single sentence "
2120
+ "questions, the system under test needs to determine if the sentence is "
2121
+ "true/false. For pair sentence questions, the system under test needs to "
2122
+ "determine whether a conclusion can be drawn from another sentence.\n",
2123
+ taxonomy=TaxonomyInfo(
2124
+ task="pragmatic reasoning", what="presuppositions", when="?", who="?", language=self.language
2125
+ ),
2126
+ main_metric="exact_match",
2127
+ main_split="test",
2128
+ )
2129
+
2130
+
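A standalone sketch of the `choices.split()` trick used in the single-sentence branch: splitting a string such as "True or False" on whitespace gives ["True", "or", "False"], so indices 0 and 2 line up with the two translated options (the Indonesian rendering "Benar atau Salah" is assumed here):

    choices = "True or False".split()                # ["True", "or", "False"]
    choices_translated = "Benar atau Salah".split()  # ["Benar", "atau", "Salah"]
    label2choice = {choices[0]: choices_translated[0], choices[2]: choices_translated[2]}
    assert label2choice == {"True": "Benar", "False": "Salah"}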
2131
+ # 2.2 Pragmatics: LINDSEA Scalar Implicatures
2132
+ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
2133
+ """
2134
+ The LINDSEA Scalar Implicatures Scenario dataset is a linguistic diagnostic scenario targeting pragmatics.
2135
+ The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
2136
+ of quality control.
2137
+
2138
+ The scalar implicatures dataset involves two formats: single and pair sentences.
2139
+ For single sentence questions, the system under test needs to determine if the sentence is true/false.
2140
+ For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
2141
+ from another sentence.
2142
+
2143
+ For the single format, the models are prompted using the following general format:
2144
+
2145
+ Is the following statement true or false?
2146
+ Statement: <sentence>
2147
+ Answer only with True or False.
2148
+
2149
+ For the pair format, the models are prompted using the following general format:
2150
+
2151
+ Situation: <premise>
2152
+ Given this situation, is the following statement true or false?
2153
+ Statement: <hypothesis>
2154
+ Answer only with True or False.
2155
+
2156
+ Target completion:
2157
+ <answer>
2158
+
2159
+ @misc{leong2023bhasa,
2160
+ title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
2161
+ author={Wei Qi Leong
2162
+ and Jian Gang Ngui
2163
+ and Yosephine Susanto
2164
+ and Hamsawardhini Rengarajan
2165
+ and Kengatharaiyer Sarveswaran
2166
+ and William Chandra Tjhi
2167
+ },
2168
+ year={2023},
2169
+ eprint={2309.06085},
2170
+ archivePrefix={arXiv},
2171
+ primaryClass={cs.CL}
2172
+ }
2173
+ """
2174
+
2175
+ name = "lindsea_pragmatics_scalar_implicatures"
2176
+ description = "LINDSEA scalar implicatures task"
2177
+ tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]
2178
+
2179
+ def __init__(self, language: str, subset: str):
2180
+ super().__init__()
2181
+ self.language = language
2182
+ self.subsets = [subset] if subset != "all" else ["single", "pair"]
2183
+ self.language_to_prompt_components = {
2184
+ "id": {
2185
+ "text_noun": "Pernyataan",
2186
+ "premise_noun": "Situasi",
2187
+ "conclusion_noun": "Pernyataan",
2188
+ "single_question": "Apakah pernyataan berikut ini {}?",
2189
+ "single_instruction": "Jawablah dengan {} saja.",
2190
+ "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
2191
+ "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
2192
+ "True": "Benar",
2193
+ "False": "Salah",
2194
+ },
2195
+ }
2196
+ if self.language not in self.language_to_prompt_components.keys():
2197
+ raise Exception(
2198
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
2199
+ )
2200
+ else:
2201
+ self.prompt_components = self.language_to_prompt_components[self.language]
2202
+
2203
+ def download_dataset(self, output_path: str):
2204
+ BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
2205
+ datasets = []
2206
+ for subset in self.subsets:
2207
+ URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
2208
+ file = f"pragmatic_reasoning_{subset}.jsonl"
2209
+ target_path_file = os.path.join(output_path, file)
2210
+ ensure_file_downloaded(source_url=URL, target_path=target_path_file)
2211
+ data = pd.read_json(target_path_file, lines=True)
2212
+ data["subset"] = subset
2213
+ data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
2214
+ datasets.append(data)
2215
+ dataset = pd.concat(datasets)
2216
+ return dataset
2217
+
2218
+ def get_instances(self, output_path) -> List[Instance]:
2219
+ data = self.download_dataset(output_path)
2220
+ outputs = []
2221
+ for _, row in data.iterrows():
2222
+ passage = None
2223
+ references = []
2224
+
2225
+ if row["subset"] == "single":
2226
+ question = self.prompt_components["single_question"]
2227
+ text_noun = self.prompt_components["text_noun"]
2228
+ instruction = self.prompt_components["single_instruction"]
2229
+
2230
+ passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
2231
+ question=question.format(row["question_translated"]),
2232
+ text_noun=text_noun,
2233
+ text=row["text"],
2234
+ instruction=instruction.format(row["choices_translated"]),
2235
+ )
2236
+ # Split "True or False" into ["True", "or", "False"]
2237
+ choices = row["choices"].split()
2238
+ choices_translated = row["choices_translated"].split()
2239
+ label2choice = {
2240
+ choices[0]: choices_translated[0],
2241
+ choices[2]: choices_translated[2],
2242
+ }
2243
+ references.append(
2244
+ Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
2245
+ )
2246
+
2247
+ elif row["subset"] == "pair":
2248
+ premise_noun = self.prompt_components["premise_noun"]
2249
+ question = self.prompt_components["pair_question"]
2250
+ conclusion_noun = self.prompt_components["conclusion_noun"]
2251
+ instruction = self.prompt_components["pair_instruction"]
2252
+ label = self.prompt_components[str(row["label"])]
2253
+
2254
+ passage = (
2255
+ "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
2256
+ premise_noun=premise_noun,
2257
+ premise=row["text"],
2258
+ question=question,
2259
+ conclusion_noun=conclusion_noun,
2260
+ conclusion=row["conclusion"],
2261
+ instruction=instruction,
2262
+ )
2263
+ )
2264
+
2265
+ references.append(
2266
+ Reference(Output(text=label), tags=[CORRECT_TAG]),
2267
+ )
2268
+
2269
+ input = Input(text=str(passage))
2270
+ instance = Instance(
2271
+ input=input,
2272
+ references=references,
2273
+ split=TEST_SPLIT,
2274
+ )
2275
+ outputs.append(instance)
2276
+ return outputs
2277
+
2278
+ def get_metadata(self) -> ScenarioMetadata:
2279
+ return ScenarioMetadata(
2280
+ name=f"lindsea_pragmatics_scalar_implicatures_{self.language}",
2281
+ display_name="LINDSEA Pragmatics Scalar Implicatures",
2282
+ short_display_name=None,
2283
+ description="LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for "
2284
+ "pragmatics dataset from BHASA [(Leong, "
2285
+ "2023)](https://arxiv.org/abs/2309.06085), , involving two formats: single and "
2286
+ "pair sentences. For single sentence questions, the system under test needs to "
2287
+ "determine if the sentence is true/false. For pair sentence questions, the "
2288
+ "system under test needs to determine whether a conclusion can be drawn from "
2289
+ "another sentence.\n",
2290
+ taxonomy=TaxonomyInfo(
2291
+ task="pragmatic reasoning", what="scalar implicatures", when="?", who="?", language=self.language
2292
+ ),
2293
+ main_metric="exact_match",
2294
+ main_split="test",
2295
+ )