crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of crfm-helm has been flagged as potentially problematic.

Files changed (1033)
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,2295 @@
+ import datasets
+ import os
+ import random
+ from typing import List, Dict
+
+ import pandas as pd
+
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+ from helm.benchmark.scenarios.scenario import (
+     Input,
+     Instance,
+     Output,
+     PassageQuestionInput,
+     Reference,
+     Scenario,
+     CORRECT_TAG,
+     TEST_SPLIT,
+     TRAIN_SPLIT,
+     ScenarioMetadata,
+ )
+ from helm.common.general import ensure_file_downloaded
+ from helm.common.hierarchical_logger import hlog
+
+ # SEA-HELM Scenarios
+ # A. Natural Language Understanding
+ # B. Natural Language Generation
+ # C. Natural Language Reasoning
+ # D. Linguistic Diagnostics
+
+ # A. Natural Language Understanding
+ # 1. Question Answering
+ # 2. Sentiment Analysis
+ # 3. Toxicity Detection/Classification
+
+
+ # 1. Question Answering
+ # 1.1 Indonesian: TyDiQA
+ class TyDiQAScenario(Scenario):
+     """
+     TyDiQA is an open-book question answering scenario for 11 typologically-diverse languages.
+     The questions are written by people who want to know the answer, but do not know the answer yet,
+     and the data is collected directly in each language without the use of translation.
+
+     This scenario only uses the Indonesian subset of the data, and uses the Gold Passage (GoldP) task,
+     which requires the tested system to extract a span from the given passage to answer a given question.
+     There are no unanswerable questions.
+
+     The models are prompted using the following format:
+
+         Anda akan diberikan sebuah paragraf dan sebuah pertanyaan. Jawablah pertanyaannya dengan mengekstrak jawaban
+         dari paragraf tersebut.
+
+         Paragraf: <text>
+         Pertanyaan: <question>
+         Jawaban: <answer>
+
+         ...
+
+         Paragraf: <text>
+         Pertanyaan: <question>
+         Jawaban:
+
+
+     Target completion:
+         <answer>
+
+     @article{clark-etal-2020-tydi,
+         title = "{T}y{D}i {QA}: A Benchmark for Information-Seeking Question Answering in Typologically
+             Diverse Languages",
+         author = "Clark, Jonathan H. and
+             Choi, Eunsol and
+             Collins, Michael and
+             Garrette, Dan and
+             Kwiatkowski, Tom and
+             Nikolaev, Vitaly and
+             Palomaki, Jennimaria",
+         editor = "Johnson, Mark and
+             Roark, Brian and
+             Nenkova, Ani",
+         journal = "Transactions of the Association for Computational Linguistics",
+         volume = "8",
+         year = "2020",
+         address = "Cambridge, MA",
+         publisher = "MIT Press",
+         url = "https://aclanthology.org/2020.tacl-1.30",
+         doi = "10.1162/tacl_a_00317",
+         pages = "454--470",
+     }
+     """
+
+     name = "tydiqa"
+     description = "Indonesian Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "khalidalt/tydiqa-goldp",
+             "indonesian",
+             revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+             trust_remote_code=True,
+         )
+
+         outputs = []
+         for split in self.splits.keys():
+             df = dataset[split].to_pandas()
+
+             if split == "train":
+                 # Select only bottom 20th percentile by length for in-context examples as examples are very long
+                 data = df[df["passage_text"].apply(len) < df["passage_text"].apply(len).quantile(0.2)]
+             else:
+                 data = df
+
+             for _, row in data.iterrows():
+                 passage = row["passage_text"].strip()
+                 question = row["question_text"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix="Paragraf: ",
+                     question_prefix="Pertanyaan: ",
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="tydiqa",
+             display_name="TyDiQA",
+             short_display_name=None,
+             description="TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an "
+             "open-book question answering dataset for 11 typologically-diverse languages. "
+             "The questions are written by people who want to know the answer, but do not "
+             "know the answer yet, and the data is collected directly in each language "
+             "without the use of translation.\n",
+             taxonomy=TaxonomyInfo(
+                 task="question answering",
+                 what="questions by human annotators about Wikipedia articles",
+                 when="?",
+                 who="human annotators",
+                 language="Indonesian",
+             ),
+             main_metric="squad_f1_score",
+             main_split="test",
+         )
+
+
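For readers skimming the diff, here is a minimal illustrative sketch of exercising the scenario class above; it is not part of the added file, it assumes only the Scenario API shown in this diff, and the scratch directory name is arbitrary.

# Illustrative only: load TyDiQA instances and count them per split.
scenario = TyDiQAScenario()
instances = scenario.get_instances(output_path="./tydiqa_scratch")
num_train = sum(1 for i in instances if i.split == TRAIN_SPLIT)
num_test = sum(1 for i in instances if i.split == TEST_SPLIT)
hlog(f"TyDiQA: {num_train} in-context candidates, {num_test} test instances")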
+ # 1.2 Vietnamese & Thai: XQuAD
+ class XQuADScenario(Scenario):
+     """
+     XQuAD is an open-book question answering scenario that is parallel across 10 languages.
+     The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the
+     development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
+
+     This scenario only uses the Vietnamese and Thai subsets of the data and there are no
+     unanswerable questions.
+
+     The models are prompted using the following general format:
+
+         You will be given a paragraph and a question. Answer the question by extracting the answer from the paragraph.
+
+         Paragraph: <text>
+         Question: <question>
+         Answer: <answer>
+
+         ...
+
+         Paragraph: <text>
+         Question: <question>
+         Answer:
+
+     Target completion:
+         <answer>
+
+     @article{Artetxe:etal:2019,
+         author = {Mikel Artetxe and Sebastian Ruder and Dani Yogatama},
+         title = {On the cross-lingual transferability of monolingual representations},
+         journal = {CoRR},
+         volume = {abs/1910.11856},
+         year = {2019},
+         archivePrefix = {arXiv},
+         eprint = {1910.11856}
+     }
+     """
+
+     name = "xquad"
+     description = "Vietnamese and Thai Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self, language: str):
+         super().__init__()
+         self.language = language
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.language_to_prompt_components = {
+             "th": {
+                 "passage_prefix": "ข้อความ: ",
+                 "question_prefix": "คำถาม: ",
+                 "random_state": 4520,
+             },
+             "vi": {
+                 "passage_prefix": "Đoạn văn: ",
+                 "question_prefix": "Câu hỏi: ",
+                 "random_state": 4502,
+             },
+         }
+         if self.language not in self.language_to_prompt_components.keys():
+             raise Exception(
+                 f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+             )
+         else:
+             self.prompt_components = self.language_to_prompt_components[self.language]
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
+         df = dataset.to_pandas()
+
+         # Sample 1000 examples for test
+         df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])
+
+         # In-context examples to be drawn from remaining examples (since there is no train data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         # Select only bottom 20th percentile by length for in-context examples as examples are very long
+         df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 passage = row["context"].strip()
+                 question = row["question"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix=str(self.prompt_components["passage_prefix"]),
+                     question_prefix=str(self.prompt_components["question_prefix"]),
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name=f"xquad_{self.language}",
+             display_name=f"XQuAD ({self.language})",
+             short_display_name=None,
+             description="XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book "
+             "question answering dataset that is parallel across 10 languages. The dataset "
+             "consists of a subset of 240 paragraphs and 1190 question-answer pairs from the "
+             "development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their "
+             "professional translations.\n",
+             taxonomy=TaxonomyInfo(
+                 task="question answering",
+                 what="questions by crowdworkers about Wikipedia articles translated "
+                 f"from English to {self.language}",
+                 when="?",
+                 who="?",
+                 language=self.language,
+             ),
+             main_metric="squad_f1_score",
+             main_split="test",
+         )
+
+
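A quick illustrative note on the class above: the constructor is keyed by language code, and anything outside the supported set raises. A brief sketch (illustrative only, not part of the diff):

# Supported codes are "th" and "vi"; any other code raises the Exception defined in __init__.
xquad_vi = XQuADScenario(language="vi")
xquad_th = XQuADScenario(language="th")
try:
    XQuADScenario(language="id")
except Exception as e:
    hlog(str(e))  # the message lists the supported language codes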
+ # 1.3 Tamil: IndicQA
+ class IndicQAScenario(Scenario):
+     """
+     IndicQA is an open-book question answering scenario for 11 Indic languages.
+     Answers to questions are to be extracted from the text provided. The data is taken from
+     Wikipedia articles across various domains and questions and answers were manually created
+     by native speakers.
+
+     This scenario only uses the Tamil subset of the data and unanswerable questions
+     are removed from the dataset in order to be consistent with the question answering
+     scenarios for Indonesian, Vietnamese and Thai.
+
+     The models are prompted using the following format:
+
+         உங்களுக்கு ஒரு பத்தியும் ஒரு கேள்வியும் தரப்படும். தரப்பட்ட பத்தியிலிருந்து கேள்விக்கான பதிலைக் கண்டறியவும்.
+
+         பத்தி: <text>
+         கேள்வி: <question>
+         பதில்: <answer>
+
+         ...
+
+         பத்தி: <text>
+         கேள்வி: <question>
+         பதில்:
+
+     Target completion:
+         <answer>
+
+     @inproceedings{doddapaneni-etal-2023-towards,
+         title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+             {I}ndic Languages",
+         author = "Doddapaneni, Sumanth and
+             Aralikatte, Rahul and
+             Ramesh, Gowtham and
+             Goyal, Shreya and
+             Khapra, Mitesh M. and
+             Kunchukuttan, Anoop and
+             Kumar, Pratyush",
+         editor = "Rogers, Anna and
+             Boyd-Graber, Jordan and
+             Okazaki, Naoaki",
+         booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+             Long Papers)",
+         month = jul,
+         year = "2023",
+         address = "Toronto, Canada",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.acl-long.693",
+         doi = "10.18653/v1/2023.acl-long.693",
+         pages = "12402--12426",
+     }
+     """
+
+     name = "indicqa"
+     description = "Tamil Open-book Question Answering task"
+     tags = ["question_answering"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "ai4bharat/IndicQA",
+             "indicqa.ta",
+             split="test",
+             revision="78ee8d58e880c72f324e176c989dfefa55427af4",
+             trust_remote_code=True,
+         )
+         df = dataset.to_pandas()
+
+         # Remove unanswerable questions (answer is an empty string)
+         df = df[df["answers"].apply(lambda x: len(x["text"][0].strip()) > 0)]
+
+         # Sample 1000 examples for test
+         df_test = df.sample(n=1000, random_state=7900)
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         # Select only bottom 20th percentile by length for in-context examples as examples are very long
+         df_train = df_train[df_train["context"].apply(len) < df_train["context"].apply(len).quantile(0.2)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 passage = row["context"].strip()
+                 question = row["question"].strip()
+                 input = PassageQuestionInput(
+                     passage=passage,
+                     question=question,
+                     passage_prefix="பத்தி: ",
+                     question_prefix="கேள்வி: ",
+                 )
+                 references = []
+                 for answer in row["answers"]["text"]:
+                     output = Output(text=answer.strip())
+                     references.append(Reference(output, tags=[CORRECT_TAG]))
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="indicqa",
+             display_name="IndicQA",
+             short_display_name=None,
+             description="IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693) is an "
+             "open-book question answering dataset for 11 Indic languages. Answers to "
+             "questions are to be extracted from the text provided. The data is taken from "
+             "Wikipedia articles across various domains and questions and answers were "
+             "manually created by native speakers.\n",
+             taxonomy=TaxonomyInfo(
+                 task="question answering",
+                 what="questions about Wikipedia articles translated by native speakers from English to Tamil",
+                 when="?",
+                 who="?",
+                 language="Tamil",
+             ),
+             main_metric="squad_f1_score",
+             main_split="test",
+         )
+
+
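To make the two filtering steps above concrete, here is a small standalone pandas sketch of the same logic on toy rows (illustrative only; the column names mirror the HuggingFace schema used above):

import pandas as pd

toy = pd.DataFrame(
    {
        "context": ["short passage", "a much longer passage " * 20, "a medium-length passage here"],
        "answers": [{"text": ["answer"]}, {"text": [""]}, {"text": ["another answer"]}],
    }
)
# Step 1: drop unanswerable rows (the first answer is an empty string).
toy = toy[toy["answers"].apply(lambda x: len(x["text"][0].strip()) > 0)]
# Step 2: keep only the shortest contexts (bottom 20th percentile by length) as in-context examples.
short_only = toy[toy["context"].apply(len) < toy["context"].apply(len).quantile(0.2)]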
+ # 2. Sentiment Analysis
+ # 2.1 Indonesian: NusaX Sentiment
+ class NusaXScenario(Scenario):
+     """
+     NusaX is a sentiment analysis scenario for 11 Indonesian languages.
+     The data is derived from a subset of SmSA (Purwarianti and Crisdayanti, 2019) and manually translated
+     from Indonesian to 10 other local languages, such as Acehnese and Toba Batak.
+     It consists of comments and reviews from various online platforms.
+
+     Only the Indonesian subset of the data is used for this scenario, and the labels are
+     positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         Apa sentimen dari kalimat berikut ini?
+         Jawablah dengan satu kata saja:
+         - Positif
+         - Negatif
+         - Netral
+
+         Kalimat: <text>
+         Jawaban: <sentiment>
+
+         ...
+
+         Kalimat: <text>
+         Jawaban:
+
+     Target completion:
+         <sentiment>
+
+     @inproceedings{winata-etal-2023-nusax,
+         title = "{N}usa{X}: Multilingual Parallel Sentiment Dataset for 10 {I}ndonesian Local Languages",
+         author = "Winata, Genta Indra and
+             Aji, Alham Fikri and
+             Cahyawijaya, Samuel and
+             Mahendra, Rahmad and
+             Koto, Fajri and
+             Romadhony, Ade and
+             Kurniawan, Kemal and
+             Moeljadi, David and
+             Prasojo, Radityo Eko and
+             Fung, Pascale and
+             Baldwin, Timothy and
+             Lau, Jey Han and
+             Sennrich, Rico and
+             Ruder, Sebastian",
+         editor = "Vlachos, Andreas and
+             Augenstein, Isabelle",
+         booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for
+             Computational Linguistics",
+         month = may,
+         year = "2023",
+         address = "Dubrovnik, Croatia",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.eacl-main.57",
+         doi = "10.18653/v1/2023.eacl-main.57",
+         pages = "815--834",
+     }
+     """
+
+     name = "nusax"
+     description = "Indonesian NusaX-Senti Sentiment Analysis dataset"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "positive": "Positif",
+             "negative": "Negatif",
+             "neutral": "Netral",
+         }
+
+     def download_dataset(self, output_path: str):
+         URLS = {
+             "test": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/test.csv",
+             "train": "https://raw.githubusercontent.com/IndoNLP/nusax/main/datasets/sentiment/indonesian/train.csv",
+         }
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(output_path, split)
+             ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
+             data = pd.read_csv(target_path_file)
+             dataset[split] = data
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["text"].strip())
+                 output = Output(text=self.sentiment2label[row["label"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="nusax",
+             display_name="NusaX",
+             short_display_name=None,
+             description="NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an "
+             "Indonesian sentiment analysis dataset. The data consists of comments and "
+             "reviews from various online platforms.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis",
+                 what="online comments and reviews",
+                 when="?",
+                 who="internet users",
+                 language="Indonesian",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 2.2 Vietnamese: UIT-VSFC
+ class UITVSFCScenario(Scenario):
+     """
+     UIT-VSFC is a Vietnamese sentiment analysis scenario. The data consists of student feedback obtained from
+     end-of-semester surveys at a Vietnamese university. Feedback is labeled as one of three sentiment
+     polarities: positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         Sắc thái của câu sau đây là gì?
+         Trả lời với một từ duy nhất:
+         - Tích cực
+         - Tiêu cực
+         - Trung lập
+
+         Câu văn: <text>
+         Câu trả lời: <sentiment>
+
+         ...
+
+         Câu văn: <text>
+         Câu trả lời:
+
+     Target completion:
+         <sentiment>
+
+     @inproceedings{van2018uit,
+         title={UIT-VSFC: Vietnamese students’ feedback corpus for sentiment analysis},
+         author={Van Nguyen, Kiet and Nguyen, Vu Duc and Nguyen, Phu XV and Truong, Tham TH and Nguyen, Ngan Luu-Thuy},
+         booktitle={2018 10th international conference on knowledge and systems engineering (KSE)},
+         pages={19--24},
+         year={2018},
+         organization={IEEE},
+         url={https://ieeexplore.ieee.org/document/8573337},
+     }
+     """
+
+     name = "uitvsfc"
+     description = "Vietnamese Students' Feedback Corpus sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "Tiêu cực",
+             1: "Trung lập",
+             2: "Tích cực",
+         }
+
+     def download_dataset(self, output_path: str):
+         URLS = {
+             "train": {
+                 "sentences": "https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download",
+                 "sentiments": "https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download",
+             },
+             "test": {
+                 "sentences": "https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download",
+                 "sentiments": "https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download",
+             },
+         }
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in list(URLS.keys()):
+             file_lines: Dict[str, List[str]] = {}
+             for file in list(URLS[split].keys()):
+                 file_lines[file] = []
+                 target_path_file = os.path.join(output_path, split, file)
+                 ensure_file_downloaded(source_url=URLS[split][file], target_path=target_path_file)
+                 with open(target_path_file, "r") as f:
+                     lines = f.readlines()
+                     for line in lines:
+                         file_lines[file].append(str(line).strip())
+             df = pd.DataFrame({"text": file_lines["sentences"], "label": file_lines["sentiments"]})
+             if split == "test":
+                 dataset[split] = df.groupby("label", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+                 )
+             else:
+                 dataset[split] = df
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["text"])
+                 output = Output(text=self.id2label[int(row["label"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="uitvsfc",
+             display_name="UIT-VSFC",
+             short_display_name=None,
+             description="UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a "
+             "Vietnamese sentiment analysis dataset. The data consists of student feedback "
+             "obtained from end-of-semester surveys at a Vietnamese university.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis",
+                 what="university student end-of-semester survey responses",
+                 when="?",
+                 who="university students",
+                 language="Vietnamese",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
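The test split above is built with a label-stratified sample of roughly 1,000 rows, a pattern that recurs in several scenarios below. Here is an isolated sketch of that pattern (illustrative only; toy data and an arbitrary seed):

import pandas as pd

df = pd.DataFrame({"text": [f"t{i}" for i in range(3000)], "label": [i % 3 for i in range(3000)]})
# Sampling frac=1000/len(df) within each label group preserves the label distribution
# while drawing about 1,000 rows overall.
sample = df.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=1000 / len(df), random_state=0))
assert abs(len(sample) - 1000) < 5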
+ # 2.3 Thai: Wisesight Sentiment
+ class WisesightScenario(Scenario):
+     """
+     Wisesight Sentiment is a Thai sentiment analysis scenario. The data consists of social media messages
+     regarding consumer products and services.
+
+     The dataset originally included the label "question" for instances that were questions. These instances
+     made up only a small subset of the data and were dropped in order to make the task more consistent
+     with those of other languages. Labels are therefore only positive, negative or neutral.
+
+     The models are prompted using the following format:
+
+         อารมณ์ความรู้สึกของข้อความต่อไปนี้เป็นอย่างไร?
+         กรุณาตอบโดยใช้คำเดียวเท่านั้น:
+         - แง่บวก
+         - แง่ลบ
+         - เฉยๆ
+
+         ข้อความ: <text>
+         คำตอบ: <sentiment>
+
+         ...
+
+         ข้อความ: <text>
+         คำตอบ:
+
+     Target completion:
+         <sentiment>
+
+     @software{bact_2019_3457447,
+         author = {Suriyawongkul, Arthit and
+             Chuangsuwanich, Ekapol and
+             Chormai, Pattarawat and
+             Polpanumas, Charin},
+         title = {PyThaiNLP/wisesight-sentiment: First release},
+         month = sep,
+         year = 2019,
+         publisher = {Zenodo},
+         version = {v1.0},
+         doi = {10.5281/zenodo.3457447},
+         url = {https://doi.org/10.5281/zenodo.3457447}
+     }
+     """
+
+     name = "wisesight"
+     description = "Wisesight Sentiment Thai sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "pos": "แง่บวก",
+             "neg": "แง่ลบ",
+             "neu": "เฉยๆ",
+         }
+
+     def download_dataset(self, output_path: str):
+         URL = "https://github.com/PyThaiNLP/wisesight-sentiment/raw/master/huggingface/data.zip"
+         data_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(data_path, "data", f"{split}.jsonl")
+             df = pd.read_json(target_path_file, lines=True)
+             df = df[df["category"] != "q"]  # Drop instances with the "question" label
+             if split == "test":
+                 dataset[split] = df.groupby("category", group_keys=False).apply(
+                     lambda x: x.sample(frac=1000 / len(df), random_state=4183)
+                 )
+             else:
+                 dataset[split] = df
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["texts"].strip())
+                 output = Output(text=self.sentiment2label[row["category"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="wisesight",
+             display_name="Wisesight",
+             short_display_name=None,
+             description="Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is "
+             "a Thai sentiment analysis scenario. The data consists of social media "
+             "messages regarding consumer products and services.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis",
+                 what="social media messages regarding consumer products and services",
+                 when="?",
+                 who="social media users",
+                 language="Thai",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 2.4 Tamil: IndicSentiment
+ class IndicSentimentScenario(Scenario):
+     """
+     IndicSentiment is a sentiment analysis scenario for 10 Indic languages. The data consists of
+     product reviews written in English that were then translated by native speakers of the
+     respective languages, resulting in a parallel dataset across the 10 languages.
+
+     Only the Tamil subset of the dataset is used for this scenario. Labels are positive or negative.
+
+     The models are prompted using the following format:
+
+         பின்வரும் வாக்கியத்தில் வெளிப்படுத்தப்படும் உணர்வு எது?
+         ஒரு சொல்லில் மட்டும் பதிலளிக்கவும்:
+         - நேர்மறை
+         - எதிர்மறை
+
+         வாக்கியம்: <text>
+         பதில்: <answer>
+
+         ...
+
+         வாக்கியம்: <text>
+         பதில்:
+
+     Target completion:
+         <sentiment> (positive or negative)
+
+     @inproceedings{doddapaneni-etal-2023-towards,
+         title = "Towards Leaving No {I}ndic Language Behind: Building Monolingual Corpora, Benchmark and Models for
+             {I}ndic Languages",
+         author = "Doddapaneni, Sumanth and
+             Aralikatte, Rahul and
+             Ramesh, Gowtham and
+             Goyal, Shreya and
+             Khapra, Mitesh M. and
+             Kunchukuttan, Anoop and
+             Kumar, Pratyush",
+         editor = "Rogers, Anna and
+             Boyd-Graber, Jordan and
+             Okazaki, Naoaki",
+         booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1:
+             Long Papers)",
+         month = jul,
+         year = "2023",
+         address = "Toronto, Canada",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/2023.acl-long.693",
+         doi = "10.18653/v1/2023.acl-long.693",
+         pages = "12402--12426",
+     }
+     """
+
+     name = "indicsentiment"
+     description = "IndicSentiment Tamil sentiment analysis task"
+     tags = ["sentiment_analysis"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.sentiment2label = {
+             "Positive": "நேர்மறை",
+             "Negative": "எதிர்மறை",
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "ai4bharat/IndicSentiment",
+             "translation-ta",
+             revision="dc8f3f66886531c6897fedffca1e938a68fc5013",
+             trust_remote_code=True,
+         )
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split].to_pandas()
+             # Current version on HuggingFace datasets has 2 instances without labels across all languages.
+             # Confirmed with first author that the labels for these instances should be Positive.
+             data["LABEL"] = data["LABEL"].fillna("Positive")
+             for _, row in data.iterrows():
+                 input = Input(row["INDIC REVIEW"].strip())
+                 output = Output(text=self.sentiment2label[row["LABEL"]])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="indicsentiment",
+             display_name="IndicSentiment",
+             short_display_name=None,
+             description="IndicSentiment is a Tamil sentiment analysis dataset that comes from "
+             "IndicXTREME [(Doddapaneni, "
+             "2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product "
+             "reviews that were written by annotators. Labels are positive or negative.\n",
+             taxonomy=TaxonomyInfo(
+                 task="sentiment analysis", what="product reviews", when="?", who="human annotators", language="Tamil"
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 3. Toxicity Detection/Classification
+ # 3.1 Indonesian: Multi-Label Hate Speech Detection
+ class MLHSDScenario(Scenario):
+     """
+     Multi-Label Hate Speech and Abusive Language Detection (MLHSD) is an Indonesian toxicity
+     classification scenario. The data is obtained from Twitter and PII has been anonymized to
+     USER and URL.
+
+     The original dataset was used for a multi-label classification task, but it has been repurposed
+     as a multi-class classification task to be more aligned with the task for other languages.
+     The mapping is done as follows:
+     - Clean: No abusive language or hate speech labels
+     - Abusive: Only abusive language label but no hate speech labels
+     - Hate: As long as one hate speech label is present
+
+     The models are prompted using the following format:
+
+         Anda adalah pendeteksi ujaran kebencian. Definisi dari labelnya adalah sebagai berikut:
+         Bersih: Tidak ada ujaran kebencian.
+         Kasar: Ada ujaran kebencian dan kata-kata kasar, namun tidak menyerang pihak tertentu.
+         Benci: Ada ujaran kebencian atau serangan langsung terhadap pihak tertentu.
+         Berdasarkan definisi labelnya, klasifikasikan kalimat berikut ini dengan satu kata saja:
+         - Bersih
+         - Kasar
+         - Benci
+
+         Kalimat: <text>
+         Jawaban: <answer>
+
+         ...
+
+         Kalimat: <text>
+         Jawaban:
+
+     Target completion:
+         <answer>
+
+     @inproceedings{ibrohim-budi-2019-multi,
+         title = "Multi-label Hate Speech and Abusive Language Detection in {I}ndonesian {T}witter",
+         author = "Ibrohim, Muhammad Okky and
+             Budi, Indra",
+         editor = "Roberts, Sarah T. and
+             Tetreault, Joel and
+             Prabhakaran, Vinodkumar and
+             Waseem, Zeerak",
+         booktitle = "Proceedings of the Third Workshop on Abusive Language Online",
+         month = aug,
+         year = "2019",
+         address = "Florence, Italy",
+         publisher = "Association for Computational Linguistics",
+         url = "https://aclanthology.org/W19-3506",
+         doi = "10.18653/v1/W19-3506",
+         pages = "46--57",
+     }
+     """
+
+     name = "mlhsd"
+     description = (
+         "Multi-Label Hate Speech and Abusive Language Detection (MLHSD) Indonesian toxicity classification task"
+     )
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+     def download_dataset(self, output_path: str):
+         BASE_URL = "https://raw.githubusercontent.com/okkyibrohim/"
+         URL = f"{BASE_URL}id-multi-label-hate-speech-and-abusive-language-detection/master/re_dataset.csv"
+         target_path_file = os.path.join(output_path, "mlhsd")
+         ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+         df = pd.read_csv(target_path_file, encoding="ISO-8859-1")
+
+         # Map multi-label task to multi-class task
+         df["label"] = df.apply(lambda x: self.get_label(x), axis=1)
+
+         df_test = df.groupby("label", group_keys=False).apply(
+             lambda x: x.sample(frac=1000 / len(df), random_state=7123)
+         )
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+         return dataset
+
+     def get_label(self, row) -> str:
+         if int(row["HS"]) == 1:
+             return "Benci"
+         elif int(row["Abusive"]) == 1:
+             return "Kasar"
+         else:
+             return "Bersih"
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["Tweet"].strip())
+                 output = Output(text=row["label"])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="mlhsd",
+             display_name="MLHSD",
+             short_display_name=None,
+             description="MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian "
+             "toxicity detection dataset obtained from tweets on Twitter.\n",
+             taxonomy=TaxonomyInfo(
+                 task="toxicity detection/classification",
+                 what="tweets",
+                 when="?",
+                 who="Twitter users",
+                 language="Indonesian",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
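The multi-label-to-multi-class mapping described in the docstring boils down to the precedence hate > abusive > clean; a small sketch of the same rule outside the class (illustrative only, toy rows):

import pandas as pd

rows = pd.DataFrame({"HS": [1, 0, 0], "Abusive": [1, 1, 0]})

def to_class(row) -> str:
    # Same precedence as MLHSDScenario.get_label: any hate-speech label wins,
    # then abusive-only, otherwise clean.
    if int(row["HS"]) == 1:
        return "Benci"
    elif int(row["Abusive"]) == 1:
        return "Kasar"
    return "Bersih"

rows["label"] = rows.apply(to_class, axis=1)  # ["Benci", "Kasar", "Bersih"]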
+ # 3.2 Vietnamese: ViHSD
+ class ViHSDScenario(Scenario):
+     """
+     ViHSD is a Vietnamese toxicity classification scenario. The data is obtained from social media.
+     The labels are Clean, Offensive and Hate.
+
+     The models are prompted using the following format:
+
+         Bạn là máy phát hiện phát ngôn thù ghét. Các nhãn được định nghĩa như sau:
+         Sạch: Không quấy rối.
+         Công kích: Bao gồm quấy rối và thậm chí chửi thề, nhưng không tấn công bất kì đối tượng cụ thể nào.
+         Thù ghét: Trực tiếp quấy rối hay lăng mạ một đối tượng cụ thể.
+         Với các định nghĩa của nhãn, hãy phân loại câu dưới đây với một từ duy nhất:
+         - Sạch
+         - Công kích
+         - Thù ghét
+
+
+         Câu văn: <text>
+         Câu trả lời: <toxicity>
+
+         ...
+
+         Câu văn: <text>
+         Câu trả lời:
+
+     Target completion:
+         <toxicity>
+
+     @InProceedings{10.1007/978-3-030-79457-6_35,
+         author="Luu, Son T.
+             and Nguyen, Kiet Van
+             and Nguyen, Ngan Luu-Thuy",
+         editor="Fujita, Hamido
+             and Selamat, Ali
+             and Lin, Jerry Chun-Wei
+             and Ali, Moonis",
+         title="A Large-Scale Dataset for Hate Speech Detection on Vietnamese Social Media Texts",
+         booktitle="Advances and Trends in Artificial Intelligence. Artificial Intelligence Practices",
+         year="2021",
+         publisher="Springer International Publishing",
+         address="Cham",
+         pages="415--426",
+         isbn="978-3-030-79457-6",
+         url="https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35",
+     }
+     """
+
+     name = "vihsd"
+     description = "ViHSD Vietnamese toxicity classification task"
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "Sạch",
+             1: "Công kích",
+             2: "Thù ghét",
+         }
+
+     def download_dataset(self, output_path: str):
+         URL = "https://raw.githubusercontent.com/sonlam1102/vihsd/main/data/vihsd.zip"
+         data_path = os.path.join(output_path, "data")
+         ensure_file_downloaded(source_url=URL, target_path=data_path, unpack=True)
+
+         dataset: Dict[str, pd.DataFrame] = {}
+         for split in self.splits.keys():
+             target_path_file = os.path.join(data_path, "vihsd", f"{split}.csv")
+             df = pd.read_csv(target_path_file)
+             data = df.groupby("label_id", group_keys=False).apply(
+                 lambda x: x.sample(frac=1000 / len(df), random_state=4878)
+             )
+             dataset[split] = data
+         return dataset
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = self.download_dataset(output_path)
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(str(row["free_text"]).strip())
+                 output = Output(text=self.id2label[int(row["label_id"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="vihsd",
+             display_name="ViHSD",
+             short_display_name=None,
+             description="ViHSD [(Luu, "
+             "2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35) is a "
+             "Vietnamese toxicity detection dataset obtained from comments on Facebook, "
+             "Youtube, Instagram, and Tiktok.\n",
+             taxonomy=TaxonomyInfo(
+                 task="toxicity detection/classification",
+                 what="social media comments",
+                 when="?",
+                 who="social media users",
+                 language="Vietnamese",
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
+ # 3.3 Thai: Thai Toxicity Tweets
+ class ThaiToxicityTweetsScenario(Scenario):
+     """
+     Thai Toxicity Tweets is a Thai toxicity detection scenario. The data is obtained from Twitter.
+     Instances with no labels or with "TWEET_NOT_FOUND" as the text were dropped from the dataset.
+     The labels are either Y (the text is toxic) or N (the text is clean).
+
+     The models are prompted using the following format:
+
+         คุณเป็นเครื่องตรวจจับข้อความที่เป็นพิษ
+         ข้อความเป็นพิษถ้ามันทำร้าย สร้างความเสียหาย หรือสื่อเจตนาเชิงลบถึงบุคคลหรือกลุ่มบุคคล
+         และคุณควรคำนึงถึงการประชดประชันด้วย
+         เมื่อได้รับข้อความ ให้ตอบด้วยตัวอักษร Y หากเป็นพิษ และ N หากไม่เป็นพิษ
+
+         ข้อความ: <text>
+         คำตอบ: <toxicity>
+
+         ...
+
+         ข้อความ: <text>
+         คำตอบ:
+
+     Target completion:
+         <toxicity>
+
+     @inproceedings{sirihattasak2018annotation,
+         title={Annotation and classification of toxicity for Thai Twitter},
+         author={Sirihattasak, Sugan and Komachi, Mamoru and Ishikawa, Hiroshi},
+         booktitle={TA-COS 2018: 2nd Workshop on Text Analytics for Cybersecurity and Online Safety},
+         pages={1},
+         year={2018},
+         url={http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf},
+     }
+     """
+
+     name = "thaitoxicitytweets"
+     description = "Thai Toxicity Tweets toxicity detection task"
+     tags = ["toxicity_detection"]
+
+     def __init__(self):
+         super().__init__()
+         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+         self.id2label = {
+             0: "N",
+             1: "Y",
+         }
+
+     def get_instances(self, output_path) -> List[Instance]:
+         dataset = datasets.load_dataset(
+             "tmu-nlp/thai_toxicity_tweet",
+             split="train",
+             revision="aa021e41d0ee6dbee2975fbed620ec8c586bdaf6",
+             trust_remote_code=True,
+         )
+         df = dataset.to_pandas()
+
+         # Drop instances where there are no labels or text is "TWEET_NOT_FOUND"
+         df = df[df["tweet_text"].str.len() > 0]
+         df = df[df["tweet_text"] != "TWEET_NOT_FOUND"]
+
+         df_test = df.groupby("is_toxic", group_keys=False).apply(
+             lambda x: x.sample(frac=1000 / len(df), random_state=4156)
+         )
+
+         # In-context examples to be drawn from remaining examples (since there is no train/dev data)
+         df_train = df[~df.index.isin(df_test.index)]
+
+         dataset = {
+             "train": df_train,
+             "test": df_test,
+         }
+
+         outputs = []
+         for split in self.splits.keys():
+             data = dataset[split]
+             for _, row in data.iterrows():
+                 input = Input(row["tweet_text"].strip())
+                 output = Output(text=self.id2label[int(row["is_toxic"])])
+                 references = [
+                     Reference(output, tags=[CORRECT_TAG]),
+                 ]
+                 instance = Instance(input=input, references=references, split=self.splits[split])
+                 outputs.append(instance)
+         return outputs
+
+     def get_metadata(self) -> ScenarioMetadata:
+         return ScenarioMetadata(
+             name="thaitoxicitytweets",
+             display_name="Thai Toxicity Tweets",
+             short_display_name=None,
+             description="Thai Toxicity Tweets [(Sirihattasak, "
+             "2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a "
+             "Thai toxicity detection dataset obtained from tweets on Twitter.\n",
+             taxonomy=TaxonomyInfo(
+                 task="toxicity detection/classification", what="tweets", when="", who="Twitter users", language="Thai"
+             ),
+             main_metric="classification_macro_f1",
+             main_split="test",
+         )
+
+
1209
+ # B. Natural Language Generation
1210
+ # 1. Machine Translation
1211
+
1212
+
1213
+ # 1. Machine Translation: FLoRes-200
1214
+ class FloresScenario(Scenario):
1215
+ """
1216
+ FLoRes-200 is a machine translation scenario for 200+ languages. The data is obtained from English Wikimedia
1217
+ projects (Wikivoyage, Wikijunior and Wikinews), and professionally translated across 200+ languages to obtain a
1218
+ parallel dataset.
1219
+
1220
+ Only the English, Indonesian, Vietnamese, Thai and Tamil subsets are used in this scenario. Both directions
1221
+ (in and out of English) for each Southeast Asian language are included in the scenario.
1222
+
1223
+ The models are prompted using the following general format:
1224
+
1225
+ Translate the following text into <language> language.
1226
+
1227
+ Text: <text>
1228
+ Translation: <translation>
1229
+
1230
+ ...
1231
+
1232
+ Text: <text>
1233
+ Translation:
1234
+
1235
+ Target completion:
1236
+ <translation>
1237
+
1238
+ @article{nllb2022,
1239
+ author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield,
1240
+ Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang,
1241
+ Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti,
1242
+ John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran,
1243
+ Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao,
1244
+ Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers,
1245
+ Safiyyah Saleem, Holger Schwenk, Jeff Wang
1246
+ },
1247
+ title = {No Language Left Behind: Scaling Human-Centered Machine Translation},
1248
+ year = {2022},
1249
+ url = {https://research.facebook.com/publications/no-language-left-behind/},
1250
+ }
1251
+
1252
+ """
1253
+
1254
+ name = "flores"
1255
+ description = "FLoRes-200 machine translation task"
1256
+ tags = ["machine_translation"]
1257
+
1258
+ def __init__(self, pair: str):
1259
+ super().__init__()
1260
+ self.pair = pair
1261
+ self.source = pair.split("_")[0]
1262
+ self.target = pair.split("_")[1]
1263
+
1264
+ self.splits = {"dev": TRAIN_SPLIT, "devtest": TEST_SPLIT}
1265
+
1266
+ self.languages = {
1267
+ "en": "eng_Latn",
1268
+ "id": "ind_Latn",
1269
+ "vi": "vie_Latn",
1270
+ "th": "tha_Thai",
1271
+ "ta": "tam_Taml",
1272
+ }
1273
+
1274
+ if self.source not in self.languages.keys() or self.target not in self.languages.keys():
1275
+ raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
1276
+
1277
+ def get_instances(self, output_path) -> List[Instance]:
1278
+ source_dataset = datasets.load_dataset(
1279
+ "facebook/flores",
1280
+ self.languages[self.source],
1281
+ revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
1282
+ trust_remote_code=True,
1283
+ )
1284
+ target_dataset = datasets.load_dataset(
1285
+ "facebook/flores",
1286
+ self.languages[self.target],
1287
+ revision="2db78afdeaccaedc3b33a95442a4e55766887e17",
1288
+ trust_remote_code=True,
1289
+ )
1290
+
1291
+ outputs = []
1292
+ for split in self.splits.keys():
1293
+ source_df = source_dataset[split].to_pandas()
1294
+ target_df = target_dataset[split].to_pandas()
1295
+ data = source_df.join(target_df, lsuffix="_source", rsuffix="_target")
1296
+ for _, row in data.iterrows():
1297
+ input = Input(row["sentence_source"].strip())
1298
+ output = Output(row["sentence_target"].strip())
1299
+ references = [
1300
+ Reference(output, tags=[CORRECT_TAG]),
1301
+ ]
1302
+ instance = Instance(input=input, references=references, split=self.splits[split])
1303
+ outputs.append(instance)
1304
+ return outputs
1305
+
1306
+ def get_metadata(self) -> ScenarioMetadata:
1307
+ return ScenarioMetadata(
1308
+ name=f"flores_{self.source}_{self.target}",
1309
+ display_name=f"Flores ({self.source} to {self.target})",
1310
+ short_display_name=None,
1311
+ description="Flores [(NLLB Team, "
1312
+ "2022)](https://research.facebook.com/publications/no-language-left-behind/) "
1313
+ "was created with professional human translators who translate the FLORES "
1314
+ "source dataset into the target languages and a separate group of independent "
1315
+ "translation reviewers who perform quality assessments of the human "
1316
+ "translations and provide translation feedback to the translators.\n",
1317
+ taxonomy=TaxonomyInfo(
1318
+ task="machine translation",
1319
+ what="translations from professional human translators",
1320
+ when="?",
1321
+ who="professional human translators",
1322
+ language=f"{self.source}, {self.target}",
1323
+ ),
1324
+ main_metric="chr_f_plus_plus",
1325
+ main_split="test",
1326
+ )
1327
+
1328
+
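The FloresScenario constructor treats `pair` as "<source>_<target>", with both codes drawn from its five supported languages. A standalone sketch of the parsing and lookup it relies on (the "en_id" pair is only an example):

    pair = "en_id"  # illustrative: English -> Indonesian
    source, target = pair.split("_")
    languages = {"en": "eng_Latn", "id": "ind_Latn", "vi": "vie_Latn", "th": "tha_Thai", "ta": "tam_Taml"}
    assert source in languages and target in languages
    # get_instances() then loads facebook/flores for languages[source] and languages[target]
    # and joins the two splits row by row into parallel source/target sentence instances.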
1329
+ # C. Natural Language Reasoning
1330
+ # 1. Natural Language Inference
1331
+ # 2. Causal Reasoning
1332
+
1333
+
1334
+ # 1. Natural Language Inference
1335
+ # 1.1 Indonesian: IndoNLI
1336
+ class IndoNLIScenario(Scenario):
1337
+ """
1338
+ IndoNLI is an Indonesian Natural Language Inference (NLI) scenario. The data is sourced from Wikipedia, news,
1339
+ and web articles. Native speakers use premise text from these sources and write hypothesis sentences for each
1340
+ NLI label. The labels are entailment, contradiction, or neutral.
1341
+
1342
+ The models are prompted using the following format:
1343
+
1344
+ Anda akan diberikan dua kalimat, X dan Y.
1345
+ Tentukan mana dari pernyataan berikut ini yang paling sesuai untuk kalimat X dan Y.
1346
+ A: Kalau X benar, maka Y juga harus benar.
1347
+ B: X bertentangan dengan Y.
1348
+ C: Ketika X benar, Y mungkin benar atau mungkin tidak benar.
1349
+ Jawablah dengan satu huruf saja, A, B atau C.
1350
+
1351
+ X: <sentence1>
1352
+ Y: <sentence2>
1353
+ Jawaban: <entailment>
1354
+
1355
+ ...
1356
+
1357
+ X: <sentence1>
1358
+ Y: <sentence2>
1359
+ Jawaban:
1360
+
1361
+ Target completion:
1362
+ <entailment>
1363
+
1364
+ @inproceedings{mahendra-etal-2021-indonli,
1365
+ title = "{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian",
1366
+ author = "Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara",
1367
+ booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
1368
+ month = nov,
1369
+ year = "2021",
1370
+ address = "Online and Punta Cana, Dominican Republic",
1371
+ publisher = "Association for Computational Linguistics",
1372
+ url = "https://aclanthology.org/2021.emnlp-main.821",
1373
+ pages = "10511--10527",
1374
+ }
1375
+ """
1376
+
1377
+ name = "indonli"
1378
+ description = "IndoNLI Indonesian Natural Language Inference task"
1379
+ tags = ["natural_language_inference"]
1380
+
1381
+ def __init__(self):
1382
+ super().__init__()
1383
+ self.splits = {
1384
+ "train": TRAIN_SPLIT,
1385
+ "test": TEST_SPLIT,
1386
+ }
1387
+ self.id2label = {"e": "A", "c": "B", "n": "C"}
1388
+
1389
+ def download_dataset(self, output_path: str):
1390
+ URLS = {
1391
+ "train": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/train.jsonl",
1392
+ "test": "https://raw.githubusercontent.com/ir-nlp-csui/indonli/main/data/indonli/test_lay.jsonl",
1393
+ }
1394
+
1395
+ dataset: Dict[str, pd.DataFrame] = {}
1396
+ for split in self.splits.keys():
1397
+ target_path_file = os.path.join(output_path, split)
1398
+ ensure_file_downloaded(source_url=URLS[split], target_path=target_path_file)
1399
+ df = pd.read_json(target_path_file, lines=True)
1400
+ if split == "test":
1401
+ dataset[split] = df.groupby("label", group_keys=False).apply(
1402
+ lambda x: x.sample(frac=1000 / len(df), random_state=4685)
1403
+ )
1404
+ else:
1405
+ dataset[split] = df
1406
+ return dataset
1407
+
1408
+ def get_instances(self, output_path) -> List[Instance]:
1409
+ dataset = self.download_dataset(output_path)
1410
+ outputs = []
1411
+ for split in self.splits.keys():
1412
+ data = dataset[split]
1413
+ for _, row in data.iterrows():
1414
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1415
+ input = Input(passage)
1416
+ output = Output(self.id2label[row["label"]])
1417
+ references = [
1418
+ Reference(output, tags=[CORRECT_TAG]),
1419
+ ]
1420
+ instance = Instance(input=input, references=references, split=self.splits[split])
1421
+ outputs.append(instance)
1422
+ return outputs
1423
+
1424
+ def get_metadata(self) -> ScenarioMetadata:
1425
+ return ScenarioMetadata(
1426
+ name="indonli",
1427
+ display_name="IndoNLI",
1428
+ short_display_name=None,
1429
+ description="IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a "
1430
+ "natural language inference dataset obtained from Wikipedia, news, and web "
1431
+ "articles that incorporates various linguistic phenomena such as numerical "
1432
+ "reasoning, structural changes, idioms, or temporal and spatial reasoning. \n",
1433
+ taxonomy=TaxonomyInfo(
1434
+ task="natural language inference",
1435
+ what="Wikipedia, news, and web articles",
1436
+ when="?",
1437
+ who="?",
1438
+ language="Indonesian",
1439
+ ),
1440
+ main_metric="exact_match",
1441
+ main_split="test",
1442
+ )
1443
+
1444
+
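A standalone sketch of how IndoNLIScenario assembles one instance from a raw row: the premise and hypothesis are packed into the X/Y passage, and the e/c/n label is mapped onto the A/B/C options from the prompt (the example sentences are invented; English glosses in the comments):

    id2label = {"e": "A", "c": "B", "n": "C"}        # entailment / contradiction / neutral
    premise = "Ibu memasak nasi goreng."             # "Mother is cooking fried rice."
    hypothesis = "Ada makanan yang sedang dimasak."  # "Some food is being cooked."
    passage = "X: " + premise.strip() + "\nY: " + hypothesis.strip()
    reference = id2label["e"]                        # entailment -> "A"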
1445
+ # 1.2 Vietnamese & Thai: XNLI
1446
+ class XNLIScenario(Scenario):
1447
+ """
1448
+ XNLI is a Natural Language Inference scenario for 15 languages. The data was constructed following the
1449
+ MultiNLI crowdsourcing procedure to obtain English data, which was then professionally translated across
1450
+ 14 other languages. Labels are entailment, neutral, or contradiction.
1451
+
1452
+ The models are prompted using the following general format:
1453
+
1454
+ You will be given two sentences, X and Y.
1455
+ Determine which of the following statements applies to sentences X and Y the best.
1456
+ A: If X is true, Y must be true.
1457
+ B: X contradicts Y.
1458
+ C: When X is true, Y may or may not be true.
1459
+ Answer strictly with a single letter A, B or C.
1460
+
1461
+ X: <sentence1>
1462
+ Y: <sentence2>
1463
+ Answer: <entailment>
1464
+
1465
+ ...
1466
+
1467
+ X: <sentence1>
1468
+ Y: <sentence2>
1469
+ Answer:
1470
+
1471
+ Target completion:
1472
+ <entailment>
1473
+
1474
+ @inproceedings{conneau-etal-2018-xnli,
1475
+ title = "{XNLI}: Evaluating Cross-lingual Sentence Representations",
1476
+ author = "Conneau, Alexis and
1477
+ Rinott, Ruty and
1478
+ Lample, Guillaume and
1479
+ Williams, Adina and
1480
+ Bowman, Samuel and
1481
+ Schwenk, Holger and
1482
+ Stoyanov, Veselin",
1483
+ editor = "Riloff, Ellen and
1484
+ Chiang, David and
1485
+ Hockenmaier, Julia and
1486
+ Tsujii, Jun{'}ichi",
1487
+ booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
1488
+ month = oct # "-" # nov,
1489
+ year = "2018",
1490
+ address = "Brussels, Belgium",
1491
+ publisher = "Association for Computational Linguistics",
1492
+ url = "https://aclanthology.org/D18-1269",
1493
+ doi = "10.18653/v1/D18-1269",
1494
+ pages = "2475--2485",
1495
+ }
1496
+ """
1497
+
1498
+ name = "xnli"
1499
+ description = "XNLI Natural Language Inference task"
1500
+ tags = ["natural_language_inference"]
1501
+
1502
+ def __init__(self, language: str):
1503
+ super().__init__()
1504
+ self.language = language
1505
+ self.splits = {
1506
+ "validation": TRAIN_SPLIT,
1507
+ "test": TEST_SPLIT,
1508
+ }
1509
+ self.id2label = {0: "A", 2: "B", 1: "C"}
1510
+ self.supported_languages = ["th", "vi"]
1511
+ if self.language not in self.supported_languages:
1512
+ raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")
1513
+
1514
+ def get_instances(self, output_path) -> List[Instance]:
1515
+ dataset = datasets.load_dataset("xnli", self.language)
1516
+ outputs = []
1517
+ for split in self.splits.keys():
1518
+ df = dataset[split].to_pandas()
1519
+ if split == "validation":
1520
+ data = df
1521
+ else:
1522
+ # This produces 999 instances
1523
+ data = df.groupby("label", group_keys=False).apply(
1524
+ lambda x: x.sample(frac=1000 / len(df), random_state=4156)
1525
+ )
1526
+
1527
+ # Add 1 neutral instance from remaining instances to the test data to make 1000 in total
1528
+ remainder = df[~df.index.isin(data.index)]
1529
+ neutral_instance = remainder[remainder["label"] == 1].iloc[0].to_frame().transpose()
1530
+ data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
1531
+ for _, row in data.iterrows():
1532
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1533
+ input = Input(passage)
1534
+ output = Output(self.id2label[int(row["label"])])
1535
+ references = [
1536
+ Reference(output, tags=[CORRECT_TAG]),
1537
+ ]
1538
+ instance = Instance(input=input, references=references, split=self.splits[split])
1539
+ outputs.append(instance)
1540
+ return outputs
1541
+
1542
+ def get_metadata(self) -> ScenarioMetadata:
1543
+ return ScenarioMetadata(
1544
+ name=f"xnli_{self.language}",
1545
+ display_name=f"XNLI ({self.language})",
1546
+ short_display_name=None,
1547
+ description="XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural "
1548
+ "language inference dataset obtained from crowdsourced NLI data then "
1549
+ "professionally translated across 14 other languages.\n",
1550
+ taxonomy=TaxonomyInfo(
1551
+ task="natural language inference",
1552
+ what="crowdsourced NLI data professionally translated",
1553
+ when="?",
1554
+ who="?",
1555
+ language=self.language,
1556
+ ),
1557
+ main_metric="exact_match",
1558
+ main_split="test",
1559
+ )
1560
+
1561
+
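The per-label `sample(frac=1000 / len(df))` call above rounds each group to a whole number of rows, which is why it yields 999 rather than 1000 instances and why one extra neutral row is appended afterwards. A standalone sketch with a toy frame shaped like the XNLI test split (5,010 rows, balanced over three labels):

    import pandas as pd

    df = pd.DataFrame({"label": [0, 1, 2] * 1670})   # 5,010 rows, 1,670 per label
    sample = df.groupby("label", group_keys=False).apply(
        lambda g: g.sample(frac=1000 / len(df), random_state=4156)
    )
    # round(1670 * 1000 / 5010) == 333 rows per label, so 3 * 333 == 999 in total.
    assert len(sample) == 999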
1562
+ # 1.3 Tamil: IndicXNLI
1563
+ class IndicXNLIScenario(Scenario):
1564
+ """
1565
+ IndicXNLI is a Natural Language Inference scenario for 11 Indic languages. The data was
1566
+ automatically translated from the English XNLI dataset into 11 Indic languages using
1567
+ IndicTrans (Ramesh et al., 2021).
1568
+
1569
+ Only the Tamil subset of the data is used in this scenario. The labels are
1570
+ entailment, contradiction and neutral.
1571
+
1572
+ The models are prompted using the following format:
1573
+
1574
+ உங்களுக்கு இரண்டு வாக்கியங்கள், X மற்றும் Y, தரப்படும்.
1575
+ பின்வரும் கூற்றுகளில் எது X மற்றும் Y வாக்கியங்களுடன் மிகப் பொருந்துகிறது எனக் கண்டறியவும்.
1576
+ A: X உண்மை என்றால் Y உம் உண்மையாக இருக்க வேண்டும்.
1577
+ B: X உம் Y உம் முரண்படுகின்றன.
1578
+ C: X உண்மையாக இருக்கும்போது Y உண்மையாக இருக்கலாம் அல்லது இல்லாமல் இருக்கலாம்.
1579
+ A அல்லது B அல்லது C என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.
1580
+
1581
+ X: <premise>
1582
+ Y: <hypothesis>
1583
+ பதில்: <entailment>
1584
+
1585
+ ...
1586
+
1587
+ X: <premise>
1588
+ Y: <hypothesis>
1589
+ பதில்:
1590
+
1591
+ Target completion:
1592
+ <entailment>
1593
+
1594
+ @inproceedings{aggarwal-etal-2022-indicxnli,
1595
+ title = "{I}ndic{XNLI}: Evaluating Multilingual Inference for {I}ndian Languages",
1596
+ author = "Aggarwal, Divyanshu and
1597
+ Gupta, Vivek and
1598
+ Kunchukuttan, Anoop",
1599
+ editor = "Goldberg, Yoav and
1600
+ Kozareva, Zornitsa and
1601
+ Zhang, Yue",
1602
+ booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
1603
+ month = dec,
1604
+ year = "2022",
1605
+ address = "Abu Dhabi, United Arab Emirates",
1606
+ publisher = "Association for Computational Linguistics",
1607
+ url = "https://aclanthology.org/2022.emnlp-main.755",
1608
+ doi = "10.18653/v1/2022.emnlp-main.755",
1609
+ pages = "10994--11006",
1610
+ }
1611
+ """
1612
+
1613
+ name = "indicxnli"
1614
+ description = "IndicXNLI Natural Language Inference task"
1615
+ tags = ["natural_language_inference"]
1616
+
1617
+ def __init__(self):
1618
+ super().__init__()
1619
+ self.splits = {
1620
+ "validation": TRAIN_SPLIT,
1621
+ "test": TEST_SPLIT,
1622
+ }
1623
+ self.id2label = {0: "A", 2: "B", 1: "C"}
1624
+
1625
+ def get_instances(self, output_path) -> List[Instance]:
1626
+ dataset = datasets.load_dataset("Divyanshu/indicxnli", "ta")
1627
+
1628
+ outputs = []
1629
+ for split in self.splits.keys():
1630
+ df = dataset[split].to_pandas()
1631
+ if split == "validation":
1632
+ data = df
1633
+ else:
1634
+ # This produces 999 instances
1635
+ data = df.groupby("label", group_keys=False).apply(
1636
+ lambda x: x.sample(frac=1000 / len(df), random_state=4156)
1637
+ )
1638
+
1639
+ # Add 1 neutral instance from remaining instances to the test data to make 1000 in total
1640
+ remainder = df[~df.index.isin(data.index)]
1641
+ neutral_instance = remainder[remainder["label"] == 2].iloc[0].to_frame().transpose()
1642
+ data = pd.concat([data, neutral_instance], axis=0, ignore_index=True)
1643
+ for _, row in data.iterrows():
1644
+ passage = "X: " + row["premise"].strip() + "\nY: " + row["hypothesis"].strip()
1645
+ input = Input(passage)
1646
+ output = Output(text=self.id2label[row["label"]])
1647
+ references = [
1648
+ Reference(output, tags=[CORRECT_TAG]),
1649
+ ]
1650
+ instance = Instance(input=input, references=references, split=self.splits[split])
1651
+ outputs.append(instance)
1652
+ return outputs
1653
+
1654
+ def get_metadata(self) -> ScenarioMetadata:
1655
+ return ScenarioMetadata(
1656
+ name="indicxnli",
1657
+ display_name="IndicXNLI",
1658
+ short_display_name=None,
1659
+ description="IndicXNLI is a Tamil sentiment analysis dataset that comes from IndicXTREME "
1660
+ "[(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), which "
1661
+ "automatically translated from XNLI into 11 Indic languages.\n",
1662
+ taxonomy=TaxonomyInfo(
1663
+ task="natural language inference",
1664
+ what="crowdsourced NLI data professionally translated into Tamil",
1665
+ when="?",
1666
+ who="?",
1667
+ language="Tamil",
1668
+ ),
1669
+ main_metric="exact_match",
1670
+ main_split="test",
1671
+ )
1672
+
1673
+
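In the prompt, options A, B and C stand for entailment, contradiction and neutral respectively, so the id2label mapping above pairs label id 0 with A, 2 with B and 1 with C; this matches the usual XNLI id order (0 = entailment, 1 = neutral, 2 = contradiction), which IndicXNLI is assumed to share. A one-line check of the mapping:

    id2label = {0: "A", 2: "B", 1: "C"}
    # 0 (entailment) -> A, 1 (neutral) -> C, 2 (contradiction) -> B
    assert [id2label[i] for i in (0, 1, 2)] == ["A", "C", "B"]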
1674
+ # 2. Causal Reasoning: XCOPA
1675
+ class XCOPAScenario(Scenario):
1676
+ """
1677
+ XCOPA is a commonsense causal reasoning scenario for 11 languages. The data is sourced from the English
1678
+ COPA dataset and professionally translated across 11 languages to create a parallel dataset.
1679
+
1680
+ Only the Indonesian, Vietnamese, Thai and Tamil subsets were used for this scenario. Each instance consists of
1681
+ a premise and two sentences. The system under test needs to determine which of the two sentences is more likely
1682
+ to be the cause/effect of the premise. Whether the cause or the effect is asked for differs from instance to
1683
+ instance. Although there should be an equal number of instances asking for the cause and for the effect, it was
1684
+ found in the BHASA paper (Leong et al., 2023) that this was not the case for Indonesian and Thai. The
1685
+ cause/effect labels are therefore corrected in this scenario by harmonizing them across the four languages, using the
1686
+ Tamil subset as the reference.
1687
+
1688
+ The models are prompted using the following general format:
1689
+
1690
+ Based on the following situation, which of the following choices is most likely to be its {cause/effect}?
1691
+ Answer only with a single letter A or B.
1692
+
1693
+ Situation: <premise>
1694
+ A: <choice1>
1695
+ B: <choice2>
1696
+ Answer: <answer>
1697
+
1698
+ ...
1699
+
1700
+ Situation: <premise>
1701
+ A: <choice1>
1702
+ B: <choice2>
1703
+ Answer:
1704
+
1705
+ Target completion:
1706
+ <answer>
1707
+
1708
+ @article{ponti2020xcopa,
1709
+ title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
1710
+ author={Edoardo M. Ponti, Goran Glava{\v{s}}, Olga Majewska,
1711
+ Qianchu Liu, Ivan Vuli{\'c} and Anna Korhonen},
1712
+ journal={arXiv preprint},
1713
+ year={2020},
1714
+ url={https://ducdauge.github.io/files/xcopa.pdf}
1715
+ }
1716
+
1717
+ @inproceedings{roemmele2011choice,
1718
+ title={Choice of plausible alternatives: An evaluation of commonsense causal reasoning},
1719
+ author={Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},
1720
+ booktitle={2011 AAAI Spring Symposium Series},
1721
+ year={2011},
1722
+ url={https://people.ict.usc.edu/~gordon/publications/AAAI-SPRING11A.PDF},
1723
+ }
1724
+ """
1725
+
1726
+ name = "xcopa"
1727
+ description = "XCOPA causal reasoning task"
1728
+ tags = ["causal_reasoning"]
1729
+
1730
+ def __init__(self, language: str):
1731
+ super().__init__()
1732
+ self.language = language
1733
+ self.splits = {
1734
+ "validation": TRAIN_SPLIT,
1735
+ "test": TEST_SPLIT,
1736
+ }
1737
+ self.id2label = {
1738
+ 0: "A",
1739
+ 1: "B",
1740
+ }
1741
+ self.language_to_prompt_components = {
1742
+ "id": {
1743
+ "cause": "sebab",
1744
+ "effect": "akibat",
1745
+ "instruction1": "Berdasarkan situasi di atas, mana dari pilihan-pilihan berikut ini yang lebih "
1746
+ "mungkin menjadi {}?",
1747
+ "instruction2": "Jawablah dengan satu huruf saja, A atau B.",
1748
+ },
1749
+ "ta": {
1750
+ "cause": "காரணமாக",
1751
+ "effect": "விளைவாக",
1752
+ "instruction1": "பின்வரும் வாக்கியங்களில் பெரும்பாலும் எது தரப்பட்ட சூழ்நிலைக்குரிய {} இருக்கும்?",
1753
+ "instruction2": "A அல்லது B என்ற ஒறே எழுத்தில் மட்டும் பதிலளிக்கவும்.",
1754
+ },
1755
+ "th": {
1756
+ "cause": "สาเหตุ",
1757
+ "effect": "ผล",
1758
+ "instruction1": "เมื่อพิจารณาจากสถานการณ์นี้ ตัวเลือกใดต่อไปนี้น่าจะเป็น{}มากกว่ากัน?",
1759
+ "instruction2": "กรุณาตอบด้วยตัวอักษร A หรือ B ตัวเดียวเท่านั้น",
1760
+ },
1761
+ "vi": {
1762
+ "cause": "nguyên nhân",
1763
+ "effect": "kết quả",
1764
+ "instruction1": "Với tình huống trên, lựa chọn nào dưới đây có khả năng cao là {} của nó hơn?",
1765
+ "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
1766
+ },
1767
+ }
1768
+ if self.language not in self.language_to_prompt_components.keys():
1769
+ raise Exception(
1770
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
1771
+ )
1772
+ else:
1773
+ self.prompt_components = self.language_to_prompt_components[self.language]
1774
+
1775
+ def get_instances(self, output_path) -> List[Instance]:
1776
+ language_dataset = datasets.load_dataset("xcopa", self.language)
1777
+ tamil_dataset = datasets.load_dataset("xcopa", "ta")
1778
+
1779
+ outputs = []
1780
+ for split in self.splits.keys():
1781
+ language_df = language_dataset[split].to_pandas()
1782
+ tamil_df = tamil_dataset[split].to_pandas()
1783
+ data = pd.merge(
1784
+ language_df, tamil_df[["question", "idx"]], on="idx"
1785
+ ) # Use the Tamil split's question column
1786
+ for _, row in data.iterrows():
1787
+ instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
1788
+ passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
1789
+ premise=row["premise"].strip(),
1790
+ instruction1=instruction1,
1791
+ choice1=row["choice1"].strip(),
1792
+ choice2=row["choice2"].strip(),
1793
+ instruction2=self.prompt_components["instruction2"],
1794
+ )
1795
+ input = Input(passage)
1796
+ output = Output(self.id2label[int(row["label"])])
1797
+ references = [
1798
+ Reference(output, tags=[CORRECT_TAG]),
1799
+ ]
1800
+ instance = Instance(input=input, references=references, split=self.splits[split])
1801
+ outputs.append(instance)
1802
+ return outputs
1803
+
1804
+ def get_metadata(self) -> ScenarioMetadata:
1805
+ return ScenarioMetadata(
1806
+ name=f"xcopa_{self.language}",
1807
+ display_name=f"XCOPA ({self.language})",
1808
+ short_display_name=None,
1809
+ description="XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is causal "
1810
+ "reasoning dataset, a translation and reannotation of the English COPA. English "
1811
+ "COPA included questions that directly assess commonsense causal reasoning.\n",
1812
+ taxonomy=TaxonomyInfo(
1813
+ task="causal reasoning",
1814
+ what="commonsense causal reasoning questions translated into " "Indonesian",
1815
+ when="?",
1816
+ who="?",
1817
+ language=self.language,
1818
+ ),
1819
+ main_metric="exact_match",
1820
+ main_split="test",
1821
+ )
1822
+
1823
+
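A standalone sketch of the label-harmonisation merge in XCOPAScenario.get_instances: both frames carry a `question` column, so pandas suffixes them `_x`/`_y` after the merge, and `question_y` (the Tamil reference) decides whether the prompt asks for the cause or the effect (the toy rows are invented):

    import pandas as pd

    language_df = pd.DataFrame({"idx": [0, 1], "question": ["cause", "effect"], "premise": ["p0", "p1"]})
    tamil_df = pd.DataFrame({"idx": [0, 1], "question": ["effect", "effect"]})
    data = pd.merge(language_df, tamil_df[["question", "idx"]], on="idx")
    # Overlapping columns get the default suffixes: question_x (local), question_y (Tamil reference).
    assert list(data["question_y"]) == ["effect", "effect"]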
1824
+ # 1. Syntax: LINDSEA Minimal Pairs
1825
+ class LINDSEASyntaxMinimalPairsScenario(Scenario):
1826
+ """
1827
+ The LINDSEA Minimal Pairs dataset is a linguistic diagnostic scenario targeting syntactic phenomena.
1828
+ The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
1829
+ of quality control. The high-level categories tested for include morphology, argument structure,
1830
+ filler-gap dependencies, as well as negative polarity items and negation.
1831
+
1832
+ The test is designed as a minimal pair, with a pair of sentences that differ minimally from each other
1833
+ and which exemplify a specific syntactic phenomenon. The system under test needs to determine which
1834
+ sentence of the pair is more acceptable.
1835
+
1836
+ The models are prompted using the following general format:
1837
+
1838
+ Which sentence is more acceptable?
1839
+ Answer only with a single letter A or B.
1840
+ <sentence>
1841
+
1842
+ Target completion:
1843
+ <sentence>
1844
+
1845
+ @misc{leong2023bhasa,
1846
+ title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
1847
+ author={Wei Qi Leong
1848
+ and Jian Gang Ngui
1849
+ and Yosephine Susanto
1850
+ and Hamsawardhini Rengarajan
1851
+ and Kengatharaiyer Sarveswaran
1852
+ and William Chandra Tjhi
1853
+ },
1854
+ year={2023},
1855
+ eprint={2309.06085},
1856
+ archivePrefix={arXiv},
1857
+ primaryClass={cs.CL},
1858
+ url={https://arxiv.org/abs/2309.06085},
1859
+ }
1860
+ """
1861
+
1862
+ name = "lindsea_minimal_pairs"
1863
+ description = "LINDSEA minimal pairs task"
1864
+ tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]
1865
+
1866
+ def __init__(self, method: str, language: str):
1867
+ super().__init__()
1868
+ self.method = method
1869
+ self.language = language
1870
+ self.language_to_prompt_components = {
1871
+ "id": {
1872
+ "instructions": "Kalimat mana yang lebih mungkin?",
1873
+ "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
1874
+ }
1875
+ }
1876
+ if self.language not in self.language_to_prompt_components.keys():
1877
+ raise Exception(
1878
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
1879
+ )
1880
+ else:
1881
+ self.prompt_components = self.language_to_prompt_components[self.language]
1882
+
1883
+ def download_dataset(self, output_path: str):
1884
+ BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
1885
+ URLS = {
1886
+ "npis_and_negation": f"{BASE_URL}{self.language}/syntax/NPIs_and_negation.jsonl",
1887
+ "argument_structure": f"{BASE_URL}{self.language}/syntax/argument_structure.jsonl",
1888
+ "filler_gap_dependencies": f"{BASE_URL}{self.language}/syntax/filler-gap_dependencies.jsonl",
1889
+ "morphology": f"{BASE_URL}{self.language}/syntax/morphology.jsonl",
1890
+ }
1891
+
1892
+ data_files = {}
1893
+ for file in list(URLS.keys()):
1894
+ target_path_file = os.path.join(output_path, file)
1895
+ ensure_file_downloaded(source_url=URLS[file], target_path=target_path_file)
1896
+ data_files[file] = pd.read_json(target_path_file, lines=True)
1897
+ dataset = pd.concat(data_files)
1898
+
1899
+ return dataset
1900
+
1901
+ def get_instances(self, output_path: str) -> List[Instance]:
1902
+ data = self.download_dataset(output_path)
1903
+
1904
+ outputs = []
1905
+ if self.method == "mcq":
1906
+ category_list = data["category"].value_counts().keys()
1907
+
1908
+ hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
1909
+ for category in category_list:
1910
+ # Fix shuffling within each category
1911
+ random.seed(1)
1912
+ for _, row in data[data["category"] == category].iterrows():
1913
+ options = [(row["correct"], 1), (row["wrong"], 2)]
1914
+ random.shuffle(options)
1915
+ options_reversed = options[0][1] == 2
1916
+ instructions = self.prompt_components["instructions"]
1917
+ output_prefix = self.prompt_components["output_prefix"]
1918
+ prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
1919
+ input = Input(text=prompt)
1920
+ # Determine correct option based on whether shuffling reversed the options
1921
+ references = [
1922
+ Reference(Output(text="A"), tags=[] if options_reversed else [CORRECT_TAG]),
1923
+ Reference(Output(text="B"), tags=[CORRECT_TAG] if options_reversed else []),
1924
+ ]
1925
+ instance = Instance(input=input, references=references, split=TEST_SPLIT)
1926
+ outputs.append(instance)
1927
+
1928
+ else:
1929
+ for _, row in data.iterrows():
1930
+ # No need to shuffle since we are comparing logprobs of the options separately
1931
+ input = Input(text="")
1932
+ references = [
1933
+ Reference(Output(text=row["correct"].strip()), tags=[CORRECT_TAG]),
1934
+ Reference(Output(text=row["wrong"].strip()), tags=[]),
1935
+ ]
1936
+ instance = Instance(
1937
+ input=input,
1938
+ references=references,
1939
+ split=TEST_SPLIT,
1940
+ )
1941
+ outputs.append(instance)
1942
+ return outputs
1943
+
1944
+ def get_metadata(self) -> ScenarioMetadata:
1945
+ return ScenarioMetadata(
1946
+ name=f"lindsea_syntax_minimal_pairs_{self.language}",
1947
+ display_name="LINDSEA Syntax Minimal Pairs",
1948
+ short_display_name=None,
1949
+ description="LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA "
1950
+ "[(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of "
1951
+ "sentences that differ minimally from each other and contrast in grammatical "
1952
+ "acceptability.\n",
1953
+ taxonomy=TaxonomyInfo(
1954
+ task="minimal pairs",
1955
+ what="sentence pairs with minimal differences and constrasting " "grammatical acceptability",
1956
+ when="?",
1957
+ who="?",
1958
+ language=self.language,
1959
+ ),
1960
+ main_metric="exact_match",
1961
+ main_split="test",
1962
+ )
1963
+
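A standalone sketch of the option-shuffling bookkeeping in the `mcq` branch above: each option is paired with a marker (1 = correct, 2 = wrong), so after shuffling, the marker sitting in position A tells us whether A or B should carry CORRECT_TAG (the sentence pair is invented):

    import random

    random.seed(1)
    correct, wrong = "Dia membaca buku.", "Dia buku membaca."  # invented minimal pair
    options = [(correct, 1), (wrong, 2)]
    random.shuffle(options)
    options_reversed = options[0][1] == 2      # True when the wrong sentence landed in position A
    answer = "B" if options_reversed else "A"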
1964
+
1965
+ # 2.1 Pragmatics: LINDSEA Presuppositions
1966
+ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
1967
+ """
1968
+ The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
1969
+ The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
1970
+ of quality control.
1971
+
1972
+ The presuppositions dataset involves two formats: single and pair sentences.
1973
+ For single sentence questions, the system under test needs to determine if the sentence is true/false.
1974
+ For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
1975
+ from another sentence.
1976
+
1977
+ For the single format, the models are prompted using the following general format:
1978
+
1979
+ Is the following statement true or false?
1980
+ Statement: <sentence>
1981
+ Answer only with True or False.
1982
+
1983
+ For the pair format, the models are prompted using the following general format:
1984
+
1985
+ Situation: <premise>
1986
+ Given this situation, is the following statement true or false?
1987
+ Statement: <hypothesis>
1988
+ Answer only with True or False.
1989
+
1990
+ Target completion:
1991
+ <answer>
1992
+
1993
+ @misc{leong2023bhasa,
1994
+ title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
1995
+ author={Wei Qi Leong
1996
+ and Jian Gang Ngui
1997
+ and Yosephine Susanto
1998
+ and Hamsawardhini Rengarajan
1999
+ and Kengatharaiyer Sarveswaran
2000
+ and William Chandra Tjhi
2001
+ },
2002
+ year={2023},
2003
+ eprint={2309.06085},
2004
+ archivePrefix={arXiv},
2005
+ primaryClass={cs.CL}
2006
+ }
2007
+ """
2008
+
2009
+ name = "lindsea_pragmatics_presuppositions"
2010
+ description = "LINDSEA presuppositions task"
2011
+ tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]
2012
+
2013
+ def __init__(self, language: str, subset: str):
2014
+ super().__init__()
2015
+ self.language = language
2016
+ self.subsets = [subset] if subset != "all" else ["single", "pair"]
2017
+ self.language_to_prompt_components = {
2018
+ "id": {
2019
+ "text_noun": "Pernyataan",
2020
+ "premise_noun": "Situasi",
2021
+ "conclusion_noun": "Pernyataan",
2022
+ "single_question": "Apakah pernyataan berikut ini {}?",
2023
+ "single_instruction": "Jawablah dengan {} saja.",
2024
+ "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
2025
+ "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
2026
+ "True": "Benar",
2027
+ "False": "Salah",
2028
+ },
2029
+ }
2030
+ if self.language not in self.language_to_prompt_components.keys():
2031
+ raise Exception(
2032
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
2033
+ )
2034
+ else:
2035
+ self.prompt_components = self.language_to_prompt_components[self.language]
2036
+
2037
+ def download_dataset(self, output_path: str):
2038
+ BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
2039
+ datasets = []
2040
+ for subset in self.subsets:
2041
+ URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
2042
+ file = f"pragmatic_reasoning_{subset}.jsonl"
2043
+ target_path_file = os.path.join(output_path, file)
2044
+ ensure_file_downloaded(source_url=URL, target_path=target_path_file)
2045
+ data = pd.read_json(target_path_file, lines=True)
2046
+ data["subset"] = subset
2047
+ data = data[data["linguistic_phenomenon"] == "presuppositions"]
2048
+ datasets.append(data)
2049
+ dataset = pd.concat(datasets)
2050
+ return dataset
2051
+
2052
+ def get_instances(self, output_path) -> List[Instance]:
2053
+ data = self.download_dataset(output_path)
2054
+ outputs = []
2055
+ for _, row in data.iterrows():
2056
+ passage = None
2057
+ references = []
2058
+
2059
+ if row["subset"] == "single":
2060
+ question = self.prompt_components["single_question"]
2061
+ text_noun = self.prompt_components["text_noun"]
2062
+ instruction = self.prompt_components["single_instruction"]
2063
+
2064
+ passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
2065
+ question=question.format(row["question_translated"]),
2066
+ text_noun=text_noun,
2067
+ text=row["text"],
2068
+ instruction=instruction.format(row["choices_translated"]),
2069
+ )
2070
+ # Split "True or False" into ["True", "or", "False"]
2071
+ choices = row["choices"].split()
2072
+ choices_translated = row["choices_translated"].split()
2073
+ label2choice = {
2074
+ choices[0]: choices_translated[0],
2075
+ choices[2]: choices_translated[2],
2076
+ }
2077
+ references.append(
2078
+ Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
2079
+ )
2080
+
2081
+ elif row["subset"] == "pair":
2082
+ premise_noun = self.prompt_components["premise_noun"]
2083
+ question = self.prompt_components["pair_question"]
2084
+ conclusion_noun = self.prompt_components["conclusion_noun"]
2085
+ instruction = self.prompt_components["pair_instruction"]
2086
+ label = self.prompt_components[str(row["label"])]
2087
+
2088
+ passage = (
2089
+ "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
2090
+ premise_noun=premise_noun,
2091
+ premise=row["text"],
2092
+ question=question,
2093
+ conclusion_noun=conclusion_noun,
2094
+ conclusion=row["conclusion"],
2095
+ instruction=instruction,
2096
+ )
2097
+ )
2098
+
2099
+ references.append(
2100
+ Reference(Output(text=label), tags=[CORRECT_TAG]),
2101
+ )
2102
+
2103
+ input = Input(text=str(passage))
2104
+ instance = Instance(
2105
+ input=input,
2106
+ references=references,
2107
+ split=TEST_SPLIT,
2108
+ )
2109
+ outputs.append(instance)
2110
+ return outputs
2111
+
2112
+ def get_metadata(self) -> ScenarioMetadata:
2113
+ return ScenarioMetadata(
2114
+ name=f"lindsea_pragmatics_presuppositions_{self.language}",
2115
+ display_name="LINDSEA Pragmatics Presuppositions",
2116
+ short_display_name=None,
2117
+ description="LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics "
2118
+ "dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), "
2119
+ "involving two formats: single and pair sentences. For single sentence "
2120
+ "questions, the system under test needs to determine if the sentence is "
2121
+ "true/false. For pair sentence questions, the system under test needs to "
2122
+ "determine whether a conclusion can be drawn from another sentence.\n",
2123
+ taxonomy=TaxonomyInfo(
2124
+ task="pragmatic reasoning", what="presuppositions", when="?", who="?", language=self.language
2125
+ ),
2126
+ main_metric="exact_match",
2127
+ main_split="test",
2128
+ )
2129
+
2130
+
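A standalone sketch of the `choices.split()` trick used in the single-sentence branch: splitting a string such as "True or False" on whitespace gives ["True", "or", "False"], so indices 0 and 2 line up with the two translated options (the Indonesian rendering "Benar atau Salah" is assumed here):

    choices = "True or False".split()                # ["True", "or", "False"]
    choices_translated = "Benar atau Salah".split()  # ["Benar", "atau", "Salah"]
    label2choice = {choices[0]: choices_translated[0], choices[2]: choices_translated[2]}
    assert label2choice == {"True": "Benar", "False": "Salah"}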
2131
+ # 2.2 Pragmatics: LINDSEA Scalar Implicatures
2132
+ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
2133
+ """
2134
+ The LINDSEA Scalar Implicatures Scenario dataset is a linguistic diagnostic scenario targeting pragmatics.
2135
+ The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
2136
+ of quality control.
2137
+
2138
+ The scalar implicatures dataset involves two formats: single and pair sentences.
2139
+ For single sentence questions, the system under test needs to determine if the sentence is true/false.
2140
+ For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
2141
+ from another sentence.
2142
+
2143
+ For the single format, the models are prompted using the following general format:
2144
+
2145
+ Is the following statement true or false?
2146
+ Statement: <sentence>
2147
+ Answer only with True or False.
2148
+
2149
+ For the pair format, the models are prompted using the following general format:
2150
+
2151
+ Situation: <premise>
2152
+ Given this situation, is the following statement true or false?
2153
+ Statement: <hypothesis>
2154
+ Answer only with True or False.
2155
+
2156
+ Target completion:
2157
+ <answer>
2158
+
2159
+ @misc{leong2023bhasa,
2160
+ title={BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation Suite for Large Language Models},
2161
+ author={Wei Qi Leong
2162
+ and Jian Gang Ngui
2163
+ and Yosephine Susanto
2164
+ and Hamsawardhini Rengarajan
2165
+ and Kengatharaiyer Sarveswaran
2166
+ and William Chandra Tjhi
2167
+ },
2168
+ year={2023},
2169
+ eprint={2309.06085},
2170
+ archivePrefix={arXiv},
2171
+ primaryClass={cs.CL}
2172
+ }
2173
+ """
2174
+
2175
+ name = "lindsea_pragmatics_scalar_implicatures"
2176
+ description = "LINDSEA scalar implicatures task"
2177
+ tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]
2178
+
2179
+ def __init__(self, language: str, subset: str):
2180
+ super().__init__()
2181
+ self.language = language
2182
+ self.subsets = [subset] if subset != "all" else ["single", "pair"]
2183
+ self.language_to_prompt_components = {
2184
+ "id": {
2185
+ "text_noun": "Pernyataan",
2186
+ "premise_noun": "Situasi",
2187
+ "conclusion_noun": "Pernyataan",
2188
+ "single_question": "Apakah pernyataan berikut ini {}?",
2189
+ "single_instruction": "Jawablah dengan {} saja.",
2190
+ "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
2191
+ "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
2192
+ "True": "Benar",
2193
+ "False": "Salah",
2194
+ },
2195
+ }
2196
+ if self.language not in self.language_to_prompt_components.keys():
2197
+ raise Exception(
2198
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
2199
+ )
2200
+ else:
2201
+ self.prompt_components = self.language_to_prompt_components[self.language]
2202
+
2203
+ def download_dataset(self, output_path: str):
2204
+ BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
2205
+ datasets = []
2206
+ for subset in self.subsets:
2207
+ URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
2208
+ file = f"pragmatic_reasoning_{subset}.jsonl"
2209
+ target_path_file = os.path.join(output_path, file)
2210
+ ensure_file_downloaded(source_url=URL, target_path=target_path_file)
2211
+ data = pd.read_json(target_path_file, lines=True)
2212
+ data["subset"] = subset
2213
+ data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
2214
+ datasets.append(data)
2215
+ dataset = pd.concat(datasets)
2216
+ return dataset
2217
+
2218
+ def get_instances(self, output_path) -> List[Instance]:
2219
+ data = self.download_dataset(output_path)
2220
+ outputs = []
2221
+ for _, row in data.iterrows():
2222
+ passage = None
2223
+ references = []
2224
+
2225
+ if row["subset"] == "single":
2226
+ question = self.prompt_components["single_question"]
2227
+ text_noun = self.prompt_components["text_noun"]
2228
+ instruction = self.prompt_components["single_instruction"]
2229
+
2230
+ passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
2231
+ question=question.format(row["question_translated"]),
2232
+ text_noun=text_noun,
2233
+ text=row["text"],
2234
+ instruction=instruction.format(row["choices_translated"]),
2235
+ )
2236
+ # Split "True or False" into ["True", "or", "False"]
2237
+ choices = row["choices"].split()
2238
+ choices_translated = row["choices_translated"].split()
2239
+ label2choice = {
2240
+ choices[0]: choices_translated[0],
2241
+ choices[2]: choices_translated[2],
2242
+ }
2243
+ references.append(
2244
+ Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
2245
+ )
2246
+
2247
+ elif row["subset"] == "pair":
2248
+ premise_noun = self.prompt_components["premise_noun"]
2249
+ question = self.prompt_components["pair_question"]
2250
+ conclusion_noun = self.prompt_components["conclusion_noun"]
2251
+ instruction = self.prompt_components["pair_instruction"]
2252
+ label = self.prompt_components[str(row["label"])]
2253
+
2254
+ passage = (
2255
+ "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
2256
+ premise_noun=premise_noun,
2257
+ premise=row["text"],
2258
+ question=question,
2259
+ conclusion_noun=conclusion_noun,
2260
+ conclusion=row["conclusion"],
2261
+ instruction=instruction,
2262
+ )
2263
+ )
2264
+
2265
+ references.append(
2266
+ Reference(Output(text=label), tags=[CORRECT_TAG]),
2267
+ )
2268
+
2269
+ input = Input(text=str(passage))
2270
+ instance = Instance(
2271
+ input=input,
2272
+ references=references,
2273
+ split=TEST_SPLIT,
2274
+ )
2275
+ outputs.append(instance)
2276
+ return outputs
2277
+
2278
+ def get_metadata(self) -> ScenarioMetadata:
2279
+ return ScenarioMetadata(
2280
+ name=f"lindsea_pragmatics_scalar_implicatures_{self.language}",
2281
+ display_name="LINDSEA Pragmatics Scalar Implicatures",
2282
+ short_display_name=None,
2283
+ description="LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for "
2284
+ "pragmatics dataset from BHASA [(Leong, "
2285
+ "2023)](https://arxiv.org/abs/2309.06085), , involving two formats: single and "
2286
+ "pair sentences. For single sentence questions, the system under test needs to "
2287
+ "determine if the sentence is true/false. For pair sentence questions, the "
2288
+ "system under test needs to determine whether a conclusion can be drawn from "
2289
+ "another sentence.\n",
2290
+ taxonomy=TaxonomyInfo(
2291
+ task="pragmatic reasoning", what="scalar implicatures", when="?", who="?", language=self.language
2292
+ ),
2293
+ main_metric="exact_match",
2294
+ main_split="test",
2295
+ )