crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/presentation/summarize.py CHANGED
@@ -9,6 +9,7 @@ Usage:
 """
 
 import argparse
+import dataclasses
 import os
 import datetime
 import urllib.parse
@@ -31,18 +32,26 @@ from helm.common.general import (
 )
 from helm.common.codec import from_json
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import get_all_stats_by_name
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
     Field,
+    Schema,
     read_schema,
     get_default_schema_path,
     BY_GROUP,
@@ -341,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-        schema_path: str,
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -376,10 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-        ensure_directory_exists(self.run_release_path)
-
-        self.schema = read_schema(schema_path)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -426,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -509,6 +518,150 @@ class Summarizer:
             model_field_dicts.append(asdict_without_nones(model_field))
         return model_field_dicts
 
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
     def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
         # Manually add the model metadata to the schema.json, where the frontend expects it.
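The fix_up_schema logic above never mutates the Schema in place; each branch swaps in an updated copy via dataclasses.replace. A minimal, self-contained sketch of that idiom (StubSchema below is an illustrative stand-in, not a HELM class):

    import dataclasses
    from dataclasses import dataclass, field
    from typing import List

    @dataclass(frozen=True)
    class StubSchema:
        # Illustrative stand-in for helm.benchmark.presentation.schema.Schema
        metrics: List[str] = field(default_factory=list)
        metric_groups: List[str] = field(default_factory=list)

    schema = StubSchema()
    if not schema.metrics:
        # Frozen dataclasses cannot be assigned to, so build a modified copy instead.
        schema = dataclasses.replace(schema, metrics=["exact_match"])
    assert schema.metrics == ["exact_match"]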
@@ -1070,7 +1223,8 @@ class Summarizer:
             is_scenario_table=False,
             aggregation_strategies=aggregate_strategies,
         )
-        tables.append(table)
+        if len(table.header) > 1:
+            tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1213,14 +1367,16 @@ class Summarizer:
         """Run the entire summarization pipeline."""
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.write_run_display_json(skip_completed)
+        ensure_directory_exists(self.run_release_path)
 
         # Must happen after self.read_runs()
         # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
         self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1254,7 +1410,15 @@ def summarize(args):
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")
 
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
@@ -1346,6 +1510,11 @@ def main():
         default=None,
         help="PATH to a YAML file to customize logging",
     )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
    args = parser.parse_args()
    setup_default_logging(args.log_config)
    summarize(args)
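Taken together, these changes add an experimental mode to helm-summarize: with --auto-generate-schema (and no --schema-path), the Summarizer starts from an empty Schema() and derives metrics, metric groups, and run groups from the metadata reported by the runs themselves; passing both flags raises a ValueError.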
helm/benchmark/presentation/taxonomy_info.py ADDED
@@ -0,0 +1,20 @@
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TaxonomyInfo:
+    # Task (e.g., question answering)
+    task: Optional[str] = None
+
+    # Domain - genre (e.g., Wikipedia)
+    what: Optional[str] = None
+
+    # Domain - when it was written (e.g., 2010s)
+    when: Optional[str] = None
+
+    # Domain - demographics (e.g., web users)
+    who: Optional[str] = None
+
+    # Language (e.g., English)
+    language: Optional[str] = None
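A quick usage sketch for the new dataclass; the field values are illustrative ones taken from its own comments:

    from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo

    # frozen=True makes instances immutable and hashable; all fields are optional.
    taxonomy = TaxonomyInfo(
        task="question answering",
        what="Wikipedia",
        when="2010s",
        who="web users",
        language="English",
    )
    assert taxonomy.language == "English"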
helm/benchmark/run.py CHANGED
@@ -37,7 +37,7 @@ def run_entries_to_run_specs(
     run_specs: List[RunSpec] = []
     for entry in run_entries:
         # Filter by priority
-        if priority is not None and entry.priority > priority:
+        if priority is not None and entry.priority is not None and entry.priority > priority:
             continue
 
         for run_spec in construct_run_specs(parse_object_spec(entry.description)):
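The extra entry.priority is not None guard matters because a run entry may omit its priority, and in Python 3 comparing None with an int raises TypeError, which previously crashed the filter. A standalone illustration of the guarded comparison:

    priority_limit = 2
    kept = []
    for entry_priority in (1, None, 3):
        # Without the None check, `None > 2` would raise TypeError.
        if priority_limit is not None and entry_priority is not None and entry_priority > priority_limit:
            continue  # only 3 is filtered out
        kept.append(entry_priority)
    assert kept == [1, None]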
helm/benchmark/run_expander.py CHANGED
@@ -1484,6 +1484,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 instructions = "Answer with only a single letter."
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
@@ -1525,6 +1527,8 @@ class OutputFormatInstructions(RunExpander):
                     "Answer only the last question with a short answer. "
                     "Avoid extra, unnecessary information in the answer."
                 )
+            elif self.scenario == "arabic_mcqa":
+                instructions = "اكتب حرف الإجابة فقط، دون أي إضافات أخرى."
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
         elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
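The Arabic instruction added in both branches translates roughly as: "Write only the letter of the answer, without any other additions."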
helm/benchmark/run_specs/arabic_run_specs.py CHANGED
@@ -4,25 +4,37 @@ EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
 
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
-from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 
+_ARABIC_REFERENCE_PREFIX_CHARACTERS = ["أ", "ب", "ج", "د", "هـ"]
+_ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"
+
+
 @run_spec_function("arabic_mmlu")
-def get_arabic_mmlu_spec() -> RunSpec:
+def get_arabic_mmlu_spec(subset: str) -> RunSpec:
     """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario")
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario", args={"subset": subset}
+    )
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
-        input_noun="Question",
-        output_noun="Answer",
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
-        name="arabic_mmlu",
+        name=f"arabic_mmlu:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
@@ -39,9 +51,12 @@ def get_alghafa_spec(subset: str) -> RunSpec:
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
-        input_noun="Question",
-        output_noun="Answer",
+        instructions="الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
     )
 
     return RunSpec(
@@ -54,20 +69,129 @@ def get_alghafa_spec(subset: str) -> RunSpec:
 
 
 @run_spec_function("aratrust")
-def get_aratrust_spec() -> RunSpec:
+def get_aratrust_spec(category: str) -> RunSpec:
     """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario")
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario",
+        args={"category": category},
+    )
 
     adapter_spec = get_generation_adapter_spec(
-        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
-        input_noun="Question",
-        output_noun="Answer",
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة: أ، ب أو ج",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
     )
 
     return RunSpec(
-        name="aratrust",
+        name=f"aratrust:category={category}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
         groups=["aratrust"],
     )
+
+
+@run_spec_function("alrage")
+def get_alrage_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.alrage_scenario.ALRAGEScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="بناءً على السياقات المقترحة التالية، اجب عن السؤال التالي",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.alrage_annotator.ALRAGEAnnotator")]
+
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.alrage_metric.ALRAGEMetric")
+    ] + get_basic_metric_specs([])
+
+    return RunSpec(
+        name="alrage",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["alrage"],
+    )
+
+
+@run_spec_function("madinah_qa")
+def get_madinah_qa_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.madinah_qa_scenario.MadinahQAScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"madinah_qa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["madinah_qa"],
+    )
+
+
+@run_spec_function("mbzuai_human_translated_arabic_mmlu")
+def get_arabic_mmmlu_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mbzuai_human_translated_arabic_mmlu.MBZUAIHumanTranslatedArabicMMLUScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"mbzuai_human_translated_arabic_mmlu:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["mbzuai_human_translated_arabic_mmlu"],
+    )
+
+
+@run_spec_function("arabic_exams")
+def get_arabic_exams_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.arabic_exams_scenario.ArabicEXAMSScenario",
+        args={"subject": subject},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="السؤال التالي هو سؤال متعدد الإختيارات. اختر الإجابة الصحيحة",  # noqa: E501
+        input_noun="السؤال",
+        output_noun="الإجابة",
+        max_tokens=100,
+        reference_prefix_characters=_ARABIC_REFERENCE_PREFIX_CHARACTERS,
+        output_mapping_pattern=_ARABIC_OUTPUT_MAPPING_PATTERN,
+    )
+
+    return RunSpec(
+        name=f"arabic_exams:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_exams"],
+    )
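The Arabic multiple-choice plumbing above hinges on the output-mapping pattern, a plain regex over the Arabic option letters. A sketch of the matching it enables (the completion string is made up; how HELM actually applies the pattern lives in the multiple-choice adapter, not shown here):

    import re

    _ARABIC_OUTPUT_MAPPING_PATTERN = "(أ|ب|ج|د|هـ)"  # copied from the diff above

    completion = "ج. هذا هو الخيار الصحيح"
    match = re.search(_ARABIC_OUTPUT_MAPPING_PATTERN, completion)
    assert match is not None and match.group(1) == "ج"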
helm/benchmark/run_specs/bluex_run_specs.py CHANGED
@@ -7,7 +7,7 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 
 @run_spec_function("bluex")
 def get_bluex_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEX_Scenario", args={})
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEXScenario", args={})
 
     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
helm/benchmark/run_specs/classic_run_specs.py CHANGED
@@ -753,12 +753,12 @@ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str =
     )
 
     return RunSpec(
-        name=f"summarization_xsum:temperature={temperature},device={device}",
+        name=f"summarization_xsum_sampled:temperature={temperature},device={device}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
         + get_generative_harms_metric_specs(),
-        groups=["summarization_xsum"],
+        groups=["summarization_xsum_sampled"],
     )
 
 
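This rename gives the sampled XSUM variant its own run spec name and group (summarization_xsum_sampled), so its results no longer collide with those of the plain summarization_xsum runs.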
helm/benchmark/run_specs/long_context_run_specs.py CHANGED
@@ -39,12 +39,12 @@ def _get_long_context_multiple_choice_adapter_spec(max_tokens: int) -> AdapterSp
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         global_prefix="",
         global_suffix="",
-        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice.",  # noqa: E501
+        instructions="Read the passage and answer the following question. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n\n",  # noqa: E501
         input_prefix="",
         input_suffix="\n",
         reference_prefix="A. ",
         reference_suffix="\n",
-        output_prefix="",
+        output_prefix="\nAnswer the question above based on the passage. Respond with only a single letter corresponding to your choice. Do not include a period in your answer.\n",  # noqa: E501
         output_suffix="",
         instance_prefix="",
         max_train_instances=0,