crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py

@@ -0,0 +1,88 @@
+ import os
+ from typing import List
+
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+ from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples  # type: ignore
+ from helm.benchmark.scenarios.scenario import (
+     VALID_SPLIT,
+     Scenario,
+     Instance,
+     Reference,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ _DATASET_TO_URL = {
+     "hotpotqa": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json",
+     "squad": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
+ }
+
+
+ class _RULERQAScenario(Scenario):
+     name = "ruler_qa"
+     description = "A QA scenario from Ruler"
+     tags = ["long_context", "rag"]
+
+     _TEMPLATE = """Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+ The following are given documents.
+
+ {context}
+
+ Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+ Question: {query} Answer:"""  # noqa: E501
+
+     def __init__(self, dataset: str, max_num_words: int):
+         super().__init__()
+         self.dataset = dataset or "hotpotqa"
+         self.max_num_words = max_num_words
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(data_dir)
+         file_path = os.path.join(data_dir, f"{self.dataset}.json")
+         url = _DATASET_TO_URL[self.dataset]
+         ensure_file_downloaded(url, file_path)
+         instances: List[Instance] = []
+         samples = generate_samples(
+             dataset=self.dataset,
+             dataset_path=file_path,
+             max_seq_length=self.max_num_words,
+             tokens_to_generate=32,
+             num_samples=500,
+             random_seed=42,
+             pre_samples=0,
+             template=self._TEMPLATE,
+         )
+         for sample in samples:
+             instance = Instance(
+                 id=sample["index"],
+                 input=Input(text=sample["input"]),
+                 references=[
+                     Reference(Output(text=output_text), tags=[CORRECT_TAG]) for output_text in sample["outputs"]
+                 ],
+                 split=VALID_SPLIT,
+             )
+             instances.append(instance)
+         return instances
+
+
+ class RULERHotpotQAScenario(_RULERQAScenario):
+     name = "ruler_hotpotqa"
+     description = "The HotpotQA long-context multi-hop RAG question answering scenario from RULER"
+     tags = ["long_context", "rag"]
+
+     def __init__(self, max_num_words: int):
+         super().__init__("hotpotqa", max_num_words)
+
+
+ class RULERSQuADScenario(_RULERQAScenario):
+     name = "ruler_squad"
+     description = "The SQuAD question answering scenario from RULER"
+     tags = ["long_context", "rag"]
+
+     def __init__(self, max_num_words: int):
+         super().__init__("squad", max_num_words)

helm/benchmark/scenarios/scenario.py

@@ -1,6 +1,6 @@
  from abc import ABC, abstractmethod
  from dataclasses import dataclass, field, replace
- from typing import List, Optional, Tuple
+ from typing import Dict, List, Optional, Tuple, Any
  import os
  from pathlib import PurePath
  import inspect
@@ -67,6 +67,11 @@ class Input:
      multimedia_content: Optional[MultimediaObject] = None
      """A single input can consists of multimodal content interleaved (e.g., text, image, text, ...)."""

+     messages: Optional[List[Dict[str, str]]] = None
+     """Used for chat models.
+     If messages is specified for a chat model, the prompt is ignored.
+     Otherwise, the client should convert the prompt into a message."""
+

  @dataclass(frozen=True)
  class PassageQuestionInput(Input):
@@ -153,6 +158,9 @@ class Instance:
      contrast_references: Optional[List[List[Reference]]] = None
      """References for the perturbed input above (if available)"""

+     extra_data: Optional[Dict[str, Any]] = None
+     """Extra data required by the scenario e.g. chain-of-thought annotations"""
+
      @property
      def first_correct_reference(self) -> Optional[Reference]:
          """Return the first correct reference."""
@@ -19,7 +19,7 @@ from helm.benchmark.scenarios.scenario import (
19
19
  from helm.common.general import ensure_file_downloaded
20
20
  from helm.common.hierarchical_logger import hlog
21
21
 
22
- # BHASA Scenarios
22
+ # SEA-HELM Scenarios
23
23
  # A. Natural Language Understanding
24
24
  # B. Natural Language Generation
25
25
  # C. Natural Language Reasoning
@@ -95,7 +95,12 @@ class TyDiQAScenario(Scenario):
95
95
  self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}
96
96
 
97
97
  def get_instances(self, output_path) -> List[Instance]:
98
- dataset = datasets.load_dataset("khalidalt/tydiqa-goldp", "indonesian")
98
+ dataset = datasets.load_dataset(
99
+ "khalidalt/tydiqa-goldp",
100
+ "indonesian",
101
+ revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
102
+ trust_remote_code=True,
103
+ )
99
104
 
100
105
  outputs = []
101
106
  for split in self.splits.keys():
@@ -171,7 +176,7 @@ class XQuADScenario(Scenario):
171
176
  super().__init__()
172
177
  self.language = language
173
178
  self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
174
- self.map = {
179
+ self.language_to_prompt_components = {
175
180
  "th": {
176
181
  "passage_prefix": "ข้อความ: ",
177
182
  "question_prefix": "คำถาม: ",
@@ -183,13 +188,19 @@ class XQuADScenario(Scenario):
183
188
  "random_state": 4502,
184
189
  },
185
190
  }
191
+ if self.language not in self.language_to_prompt_components.keys():
192
+ raise Exception(
193
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
194
+ )
195
+ else:
196
+ self.prompt_components = self.language_to_prompt_components[self.language]
186
197
 
187
198
  def get_instances(self, output_path) -> List[Instance]:
188
199
  dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
189
200
  df = dataset.to_pandas()
190
201
 
191
202
  # Sample 1000 examples for test
192
- df_test = df.sample(n=1000, random_state=self.map[self.language]["random_state"])
203
+ df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])
193
204
 
194
205
  # In-context examples to be drawn from remaining examples (since there is no train data)
195
206
  df_train = df[~df.index.isin(df_test.index)]
@@ -210,8 +221,8 @@ class XQuADScenario(Scenario):
210
221
  input = PassageQuestionInput(
211
222
  passage=passage,
212
223
  question=question,
213
- passage_prefix=str(self.map[self.language]["passage_prefix"]),
214
- question_prefix=str(self.map[self.language]["question_prefix"]),
224
+ passage_prefix=str(self.prompt_components["passage_prefix"]),
225
+ question_prefix=str(self.prompt_components["question_prefix"]),
215
226
  )
216
227
  references = []
217
228
  for answer in row["answers"]["text"]:
@@ -1068,6 +1079,9 @@ class FloresScenario(Scenario):
1068
1079
  "ta": "tam_Taml",
1069
1080
  }
1070
1081
 
1082
+ if self.source not in self.languages.keys() or self.target not in self.languages.keys():
1083
+ raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
1084
+
1071
1085
  def get_instances(self, output_path) -> List[Instance]:
1072
1086
  source_dataset = datasets.load_dataset(
1073
1087
  "facebook/flores",
@@ -1259,6 +1273,9 @@ class XNLIScenario(Scenario):
1259
1273
  "test": TEST_SPLIT,
1260
1274
  }
1261
1275
  self.id2label = {0: "A", 2: "B", 1: "C"}
1276
+ self.supported_languages = ["th", "vi"]
1277
+ if self.language not in self.supported_languages:
1278
+ raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")
1262
1279
 
1263
1280
  def get_instances(self, output_path) -> List[Instance]:
1264
1281
  dataset = datasets.load_dataset("xnli", self.language)
@@ -1449,7 +1466,7 @@ class XCOPAScenario(Scenario):
1449
1466
  0: "A",
1450
1467
  1: "B",
1451
1468
  }
1452
- self.prompt = {
1469
+ self.language_to_prompt_components = {
1453
1470
  "id": {
1454
1471
  "cause": "sebab",
1455
1472
  "effect": "akibat",
@@ -1476,6 +1493,12 @@ class XCOPAScenario(Scenario):
1476
1493
  "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
1477
1494
  },
1478
1495
  }
1496
+ if self.language not in self.language_to_prompt_components.keys():
1497
+ raise Exception(
1498
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
1499
+ )
1500
+ else:
1501
+ self.prompt_components = self.language_to_prompt_components[self.language]
1479
1502
 
1480
1503
  def get_instances(self, output_path) -> List[Instance]:
1481
1504
  language_dataset = datasets.load_dataset("xcopa", self.language)
@@ -1489,15 +1512,13 @@ class XCOPAScenario(Scenario):
1489
1512
  language_df, tamil_df[["question", "idx"]], on="idx"
1490
1513
  ) # Use the Tamil split's question column
1491
1514
  for _, row in data.iterrows():
1492
- instruction1 = self.prompt[self.language]["instruction1"].format(
1493
- self.prompt[self.language][row["question_y"]]
1494
- )
1515
+ instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
1495
1516
  passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
1496
1517
  premise=row["premise"].strip(),
1497
1518
  instruction1=instruction1,
1498
1519
  choice1=row["choice1"].strip(),
1499
1520
  choice2=row["choice2"].strip(),
1500
- instruction2=self.prompt[self.language]["instruction2"],
1521
+ instruction2=self.prompt_components["instruction2"],
1501
1522
  )
1502
1523
  input = Input(passage)
1503
1524
  output = Output(self.id2label[int(row["label"])])
@@ -1549,18 +1570,24 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
1549
1570
 
1550
1571
  name = "lindsea_minimal_pairs"
1551
1572
  description = "LINDSEA minimal pairs task"
1552
- tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]
1573
+ tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]
1553
1574
 
1554
1575
  def __init__(self, method: str, language: str):
1555
1576
  super().__init__()
1556
1577
  self.method = method
1557
1578
  self.language = language
1558
- self.prompts = {
1579
+ self.language_to_prompt_components = {
1559
1580
  "id": {
1560
1581
  "instructions": "Kalimat mana yang lebih mungkin?",
1561
1582
  "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
1562
1583
  }
1563
1584
  }
1585
+ if self.language not in self.language_to_prompt_components.keys():
1586
+ raise Exception(
1587
+ f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
1588
+ )
1589
+ else:
1590
+ self.prompt_components = self.language_to_prompt_components[self.language]
1564
1591
 
1565
1592
  def download_dataset(self, output_path: str):
1566
1593
  BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1586,6 +1613,7 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
1586
1613
  outputs = []
1587
1614
  if self.method == "mcq":
1588
1615
  category_list = data["category"].value_counts().keys()
1616
+
1589
1617
  hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
1590
1618
  for category in category_list:
1591
1619
  # Fix shuffling within each category
@@ -1594,10 +1622,8 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
1594
1622
  options = [(row["correct"], 1), (row["wrong"], 2)]
1595
1623
  random.shuffle(options)
1596
1624
  options_reversed = True if options[0][1] == 2 else False
1597
-
1598
- prompt_components = self.prompts[self.language]
1599
- instructions = prompt_components["instructions"]
1600
- output_prefix = prompt_components["output_prefix"]
1625
+ instructions = self.prompt_components["instructions"]
1626
+ output_prefix = self.prompt_components["output_prefix"]
1601
1627
  prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
1602
1628
  input = Input(text=prompt)
1603
1629
  # Determine correct option based on whether shuffling reversed the options
@@ -1625,23 +1651,31 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
1625
1651
  return outputs
1626
1652
 
1627
1653
 
1628
- # 2. Pragmatics
1629
- # 2.1 LINDSEA Pragmatic Reasoning (single sentence)
1630
- class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
1654
+ # 2.1 Pragmatics: LINDSEA Presuppositions
1655
+ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
1631
1656
  """
1632
- The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
1657
+ The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
1633
1658
  The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
1634
- of quality control. The high-level categories tested for include scalar implicatures and presuppositions.
1659
+ of quality control.
1635
1660
 
1636
- The single-sentence pragmatic reasoning dataset involves questions targeting the truth value of a single sentence.
1637
- The system under test needs to determine if the sentence is true/false or if the proposition is possible/impossible.
1661
+ The presuppositions dataset involves two formats: single and pair sentences.
1662
+ For single sentence questions, the system under test needs to determine if the sentence is true/false.
1663
+ For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
1664
+ from another sentence.
1638
1665
 
1639
- The models are prompted using the following general format:
1666
+ For the single format, the models are prompted using the following general format:
1640
1667
 
1641
1668
  Is the following statement true or false?
1642
1669
  Statement: <sentence>
1643
1670
  Answer only with True or False.
1644
1671
 
1672
+ For the pair format, the models are prompted using the following general format:
1673
+
1674
+ Situation: <premise>
1675
+ Given this situation, is the following statement true or false?
1676
+ Statement: <hypothesis>
1677
+ Answer only with True or False.
1678
+
1645
1679
  Target completion:
1646
1680
  <answer>
1647
1681
 
@@ -1661,50 +1695,101 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
     }
     """
 
-    name = "lindsea_pragmatic_reasoning_single"
-    description = "LINDSEA pragmatic reasoning single sentence task"
-    tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+    name = "lindsea_pragmatics_presuppositions"
+    description = "LINDSEA presuppositions task"
+    tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]
 
-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.prompt = {
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "question": "Apakah pernyataan berikut ini {}?",
-                "instruction": "Jawablah dengan {} saja.",
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
 
     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-        URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_single.jsonl"
-        file = "pragmatic_reasoning_single"
-        target_path_file = os.path.join(output_path, file)
-        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
-        dataset = pd.read_json(target_path_file, lines=True)
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "presuppositions"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset
 
     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage = "{question}\nPernyataan: {text}\n{instruction}".format(
-                question=self.prompt[self.language]["question"].format(row["question_translated"]),
-                text=row["text"],
-                instruction=self.prompt[self.language]["instruction"].format(row["choices_translated"]),
-            )
-            input = Input(text=passage)
-
-            # Split "True or False" into ["True", "or", "False"]
-            choices = row["choices"].split()
-            choices_translated = row["choices_translated"].split()
-            label2choice = {
-                choices[0]: choices_translated[0],
-                choices[2]: choices_translated[2],
-            }
-            references = [
-                Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
-            ]
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
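Aside: the label2choice construction above works because the dataset's choices column holds a literal phrase such as "True or False", so splitting on whitespace yields three tokens whose first and last are the two labels. A minimal standalone sketch (the sample column values are hypothetical, not taken from the dataset):

    # Sketch of the label-to-choice mapping (sample values are hypothetical).
    choices = "True or False".split()                # ["True", "or", "False"]
    choices_translated = "Benar atau Salah".split()  # ["Benar", "atau", "Salah"]
    label2choice = {
        choices[0]: choices_translated[0],  # "True" -> "Benar"
        choices[2]: choices_translated[2],  # "False" -> "Salah"
    }
    assert label2choice["True"] == "Benar"
    assert label2choice["False"] == "Salah"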
@@ -1714,17 +1799,25 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         return outputs
 
 
-# 2.2 Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
+# 2.2 Pragmatics: LINDSEA Scalar Implicatures
+class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
     """
-    The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The LINDSEA Scalar Implicatures dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control. The high-level categories tested for include scalar implicatures and presuppositions.
+    of quality control.
 
-    The sentence-pair pragmatic reasoning dataset involves questions targeting whether a conclusion can be drawn
+    The scalar implicatures dataset involves two formats: single sentences and sentence pairs.
+    For single-sentence questions, the system under test needs to determine whether the sentence is true or false.
+    For sentence-pair questions, the system under test needs to determine whether a conclusion can be drawn
     from another sentence.
 
-    The models are prompted using the following general format:
+    For the single format, the models are prompted using the following general format:
+
+        Is the following statement true or false?
+        Statement: <sentence>
+        Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:
 
         Situation: <premise>
         Given this situation, is the following statement true or false?
@@ -1750,45 +1843,101 @@ class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
     }
     """
 
-    name = "lindsea_pragmatic_reasoning_pair"
-    description = "LINDSEA pragmatic reasoning sentence pair task"
-    tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+    name = "lindsea_pragmatics_scalar_implicatures"
+    description = "LINDSEA scalar implicatures task"
+    tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]
 
-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.prompt = {
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
-                "instruction": "Jawablah dengan Benar atau Salah saja.",
-                True: "Benar",
-                False: "Salah",
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]
 
     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-        URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_pair.jsonl"
-        file = "pragmatic_reasoning_pair"
-        target_path_file = os.path.join(output_path, file)
-        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
-        dataset = pd.read_json(target_path_file, lines=True)
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset
 
     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage = "Situasi: {premise}\n{question}\nPernyataan: {conclusion}\n{instruction}".format(
-                premise=row["text"],
-                question=self.prompt[self.language]["question"],
-                conclusion=row["conclusion"],
-                instruction=self.prompt[self.language]["instruction"],
-            )
-            input = Input(text=passage)
-            references = [
-                Reference(Output(text=self.prompt[self.language][row["label"]]), tags=[CORRECT_TAG]),
-            ]
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\n{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
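Both scenarios share the same download-and-filter pattern in download_dataset: each subset file is read as JSON Lines, tagged with its subset, restricted to one linguistic phenomenon, and the frames are concatenated. A self-contained sketch of that pattern on in-memory data (the sample rows are hypothetical):

    import pandas as pd

    # Filter-and-concat pattern used by download_dataset (rows are hypothetical).
    rows_by_subset = {
        "single": [{"linguistic_phenomenon": "scalar_implicatures", "text": "..."}],
        "pair": [{"linguistic_phenomenon": "presuppositions", "text": "..."}],
    }
    datasets = []
    for subset, rows in rows_by_subset.items():
        data = pd.DataFrame(rows)
        data["subset"] = subset
        data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
        datasets.append(data)
    dataset = pd.concat(datasets)  # only the scalar-implicature row survives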
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
 
 
 class SelfInstructScenario(Scenario):
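This last hunk swaps a package-relative import for an absolute one. Both forms resolve the same module when the file is imported as part of the helm package, but the relative form raises "ImportError: attempted relative import with no known parent package" if the module is ever executed as a top-level script, so the absolute form is the safer convention:

    # Relative (old): only works when the module is imported as part of its package.
    # from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output

    # Absolute (new): resolves identically however the module is loaded.
    from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output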