crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/legal_support_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/legalbench_scenario.py
@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,20 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/lex_glue_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/lextreme_scenario.py
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
            instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )

helm/benchmark/scenarios/lmkt_scenarios.py (new file)
@@ -0,0 +1,288 @@
+"""Cultural alignment evaluation scenario based on Vietnam World Values Survey responses."""
+
+import os
+import json
+import random
+from typing import List
+from datasets import load_dataset
+from huggingface_hub import snapshot_download
+
+from helm.common.hierarchical_logger import hlog, hwarn
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+SUPPORTED_LANGUAGES = ["en", "vi"]
+
+
+class CulturalValueUnderstandingWVSScenario(Scenario):
+    """Cultural values understanding evaluation based on Vietnam World Values Survey responses."""
+
+    name = "cultural_value_understanding_wvs"
+    description = "Evaluates model understanding of cultural values from WVS Wave 7"
+    tags = ["cultural_value_understanding"]
+
+    def __init__(
+        self,
+        language: str,
+        personas_filename: str = "vn_personas_with_filtered_responses.json",
+        questions_filename: str = "wvs_questions_dump.en.json",
+        repo_id: str = "stair-lab/cultural-alignment-dataset",
+        num_personas: int = 10,
+        num_question_variants: int = 2,
+        include_few_shot_examples: bool = False,
+    ):
+        super().__init__()
+        self.language = language
+        self.personas_filename = personas_filename
+        self.questions_filename = questions_filename
+        self.repo_id = repo_id
+        self.num_personas = 10 if isinstance(num_personas, str) else num_personas
+        self.num_question_variants = min(num_question_variants, 4)
+        self.include_few_shot_examples = include_few_shot_examples
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_few_shot_examples(self) -> str:
+        """Return a few-shot learning examples formatted as a string."""
+        examples = """Here are some examples:
+
+Example 1:
+You are a 35 year old male from Vietnam. You have higher education and consider yourself middle class. You are married.
+
+How important is family in your life?
+
+1. Very important
+2. Rather important
+3. Not very important
+4. Not at all important
+
+Please select the option number that best represents your view.
+1
+
+Example 2:
+You are a 28 year old female from Vietnam. You have middle education and consider yourself working class. You are single.
+
+Would you say that most people can be trusted or that you need to be very careful in dealing with people?
+
+1. Most people can be trusted
+2. Need to be very careful
+
+Please select the option number that best represents your view.
+2
+
+Example 3:
+You are a 50 year old male from Vietnam. You have lower education and consider yourself lower class. You are married.
+
+Do you think that homosexuality is justifiable?
+
+1. Never justifiable
+2. Rarely justifiable
+3. Sometimes justifiable
+4. Always justifiable
+
+Please select the option number that best represents your view.
+1
+
+Now answer the following question:
+"""  # noqa: E501
+        return examples
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Generate test instances from Vietnam personas and WVS questions."""
+        instances: List[Instance] = []
+
+        # Download files from Hugging Face Hub
+        repo_local_path = snapshot_download(
+            repo_id=self.repo_id, repo_type="dataset", revision="fe54b6f5d75cfca5377707cd7199e39f517e3a1f"
+        )
+
+        # Load the downloaded files
+        with open(os.path.join(repo_local_path, self.personas_filename), "r", encoding="utf-8") as f:
+            personas = json.load(f)
+
+        with open(os.path.join(repo_local_path, self.questions_filename), "r", encoding="utf-8") as f:
+            questions = json.load(f)
+
+        # Get few-shot examples
+        few_shot_examples = self.get_few_shot_examples() if self.include_few_shot_examples else ""
+
+        # Sample personas
+        sampled_personas = random.sample(personas, min(self.num_personas, len(personas)))
+
+        # Create instances for each persona and question
+        for persona in sampled_personas:
+            # Get demographic info for persona description
+            persona_desc = (
+                f"You are a {persona.get('age', 'adult')} year old {persona.get('sex', 'person')} from Vietnam. "
+            )
+            persona_desc += f"You have {persona.get('education', 'some')} education and consider yourself {persona.get('social_class', 'middle class')}. "  # noqa: E501
+            persona_desc += f"You are {persona.get('marital_status', 'single')}."
+
+            # Process each question this persona answered
+            for qid, human_response in persona.get("responses", {}).items():
+                # Skip if no human response or if it's 0 (which might be a "Don't know" response)
+                if human_response is None:
+                    continue
+
+                # Convert human_response to int (if possible)
+                try:
+                    human_response_int = int(human_response)
+                except (ValueError, TypeError):
+                    # Skip if human_response can't be converted to int
+                    continue
+
+                # Get question info
+                question_data = questions.get(qid, {})
+                if not question_data:
+                    continue
+
+                # Get options directly from question data
+                q_options = question_data.get("options", [])
+                if not q_options:
+                    continue
+
+                # Skip if human_response is out of range
+                if human_response_int < 0 or human_response_int > len(q_options):
+                    continue
+
+                # Special handling for "Don't know" or zero responses
+                if human_response_int == 0:
+                    # Some questions might encode "Don't know" as 0
+                    # Skip for now, or you could add special handling
+                    continue
+
+                # Use the predefined question variations
+                question_variants = question_data.get("questions", [])
+                if not question_variants:
+                    question_variants = [f"Question {qid}: {question_data.get('description', '')}"]
+
+                # Use the specified number of variants
+                variants_to_use = min(self.num_question_variants, len(question_variants))
+                selected_variants = question_variants[:variants_to_use]
+
+                # Create instances for each selected question variant
+                for q_text in selected_variants:
+                    # Format the prompt with or without few-shot examples
+                    if self.include_few_shot_examples:
+                        prompt = f"{few_shot_examples}{persona_desc}\n\n{q_text}\n\n"
+                    else:
+                        prompt = f"{persona_desc}\n\n{q_text}\n\n"
+
+                    # Add options from question data - with numbers, not letters
+                    for i, opt in enumerate(q_options, 1):
+                        prompt += f"{i}. {opt}\n"
+
+                    prompt += "\nPlease select the option number that best represents your view. Return only the option number. Do not return anything else."  # noqa: E501
+
+                    # Create a reference with just the human response number
+                    # We don't create multiple references, just use the actual human response
+                    reference = Reference(Output(text=str(human_response_int)), tags=[CORRECT_TAG])
+
+                    # Create the instance
+                    instance = Instance(
+                        input=Input(text=prompt),
+                        references=[reference],
+                        split=TEST_SPLIT,
+                    )
+                    instances.append(instance)
+
+        if not instances:
+            hwarn("No valid instances were created. Check the input data and parameters.")
+        else:
+            hlog(f"Created {len(instances)} instances for cultural value understanding scenario.")
+
+        return instances
+
+
+class SocialNormApplicationNormADScenario(Scenario):
+    """Social norms application evaluation based on Vietnam World Values Survey responses."""
+
+    name = "social_norm_application_normad"
+    description = "Evaluates model understanding of social norms from WVS Wave 7"
+    tags = ["social_norm_application"]
+
+    def __init__(
+        self,
+        language: str,
+    ):
+        super().__init__()
+        self.language = language
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = load_dataset(
+            "akhilayerukola/NormAd", split="train", revision="311049f5610e932792672e4e13e7df62b6290336"
+        )
+
+        instances: List[Instance] = []
+
+        for item in dataset:
+            input_text = (
+                f"Assumming that below norms and story are from {item['Country'].capitalize()}:\n"
+                f"{item['Background']}\n\n{item['Story']}\n"
+                "Return one of the following options: 'yes', 'no', or 'neutral'. Do not return anything else.\n\n"
+            )
+
+            instance = Instance(
+                input=Input(text=input_text),
+                references=[Reference(Output(text=item["Gold Label"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
+
+
+class SocialNormExplanationNormADScenario(Scenario):
+    """Social norms explanation evaluation based on Vietnam World Values Survey responses."""
+
+    name = "social_norm_explanation_normad"
+    description = "Evaluates model understanding of social norms from WVS Wave 7"
+    tags = ["social_norm_explanation"]
+
+    def __init__(
+        self,
+        language: str,
+    ):
+        super().__init__()
+        self.language = language
+
+        if self.language not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {self.language}. Supported languages are: {SUPPORTED_LANGUAGES}")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = load_dataset(
+            "akhilayerukola/NormAd", split="train", revision="311049f5610e932792672e4e13e7df62b6290336"
+        )
+
+        instances: List[Instance] = []
+
+        for item in dataset:
+            if not item["Explanation"]:
+                continue
+
+            input_text = (
+                f"Assumming that below norms and story are from {item['Country'].capitalize()}:\n"
+                f"{item['Background']}\n\n{item['Story']}\n"
+                f"The answer is {item['Gold Label']}. "
+                "Briefly explain the reasoning behind this answer in one or two sentences.\n\n"
+            )
+
+            instance = Instance(
+                input=Input(text=input_text),
+                references=[Reference(Output(text=item["Explanation"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
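
For orientation, a minimal usage sketch of one of the new LMKT scenarios. Only the class, its constructor signature, and get_instances() come from the diff above; the output path and the print call are illustrative assumptions, and running this requires network access to the Hugging Face Hub.

    from helm.benchmark.scenarios.lmkt_scenarios import SocialNormApplicationNormADScenario

    # Build the NormAd-based social-norm scenario; "en" and "vi" are the supported languages.
    scenario = SocialNormApplicationNormADScenario(language="en")

    # Materialize the test instances; NormAd is loaded from the Hub at a pinned revision.
    instances = scenario.get_instances(output_path="/tmp/normad")  # hypothetical output path
    print(len(instances), instances[0].input.text[:200])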

helm/benchmark/scenarios/lsat_qa_scenario.py
@@ -2,6 +2,7 @@ import os
 import json
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -143,3 +145,15 @@ class LSATScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lsat_qa",
+            display_name="LSAT",
+            description="The LSAT benchmark for measuring analytical reasoning on the Law School "
+            "Admission Test (LSAT; [Zhong et al., "
+            "2021](https://arxiv.org/pdf/2104.06598.pdf)).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/madinah_qa_scenario.py (new file)
@@ -0,0 +1,73 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MadinahQAScenario(Scenario):
+    """MadinahQA Scenario"""
+
+    name = "madinah_qa"
+    description = "Arabic language competency benchmark"
+    tags = ["language", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+    SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        subset = subset.replace("_", " ")
+        if subset not in self.SUBSETS:
+            raise Exception(f"Unknown subset: {subset}")
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        instances: List[Instance] = []
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/MadinahQA",
+            self.subset,
+            revision="62e7c86ac5c07245a5a952722691d77ddb41f695",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
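
A quick worked example of the answer-key arithmetic used above, with illustrative values only: MadinahQA rows store the key as a letter and the choices in the "Option 1" through "Option 5" columns.

    # Assumed example row: the key "C" maps to the third option column.
    answer_key = "C"
    correct_option_index = ord(answer_key) - ord("A") + 1  # 67 - 65 + 1 = 3
    # Only the reference built from the "Option 3" column receives CORRECT_TAG.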

helm/benchmark/scenarios/math_scenario.py
@@ -4,6 +4,7 @@ import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,17 +15,19 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
 def remove_boxed(string: str) -> Optional[str]:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
-    Extract the text within a \\boxed{...} environment.
+    Extract the text within a \boxed{...} environment.
 
     Example:
-    >>> remove_boxed(\\boxed{\\frac{2}{3}})
-    \\frac{2}{3}
+    >>> from helm.benchmark.scenarios.math_scenario import *  # NOQA
+    >>> remove_boxed(r'\boxed{\frac{2}{3}}')
+    '\\frac{2}{3}'
     """
     left = "\\boxed{"
     try:
@@ -68,17 +71,17 @@ def last_boxed_only_string(string: str) -> Optional[str]:
 
 
 def _fix_fracs(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat fractions.
 
     Examples:
-    >>> _fix_fracs("\\frac1b")
-    \frac{1}{b}
-    >>> _fix_fracs("\\frac12")
-    \frac{1}{2}
-    >>> _fix_fracs("\\frac1{72}")
-    \frac{1}{72}
+    >>> _fix_fracs(r"\frac1b")
+    '\\frac{1}{b}'
+    >>> _fix_fracs(r"\frac12")
+    '\\frac{1}{2}'
+    >>> _fix_fracs(r"\frac1{72}")
+    '\\frac{1}{72}'
     """
     substrs = string.split("\\frac")
     new_str = substrs[0]
@@ -112,13 +115,13 @@ def _fix_fracs(string: str) -> str:
 
 
 def _fix_a_slash_b(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat fractions formatted as a/b to \\frac{a}{b}.
 
     Example:
-    >>> _fix_a_slash_b("2/3")
-    \frac{2}{3}
+    >>> _fix_a_slash_b(r"2/3")
+    '\\frac{2}{3}'
     """
     if len(string.split("/")) != 2:
         return string
@@ -149,13 +152,13 @@ def _remove_right_units(string: str) -> str:
 
 
 def _fix_sqrt(string: str) -> str:
-    """Source: https://github.com/hendrycks/math
+    r"""Source: https://github.com/hendrycks/math
 
     Reformat square roots.
 
     Example:
-    >>> _fix_sqrt("\\sqrt3")
-    \sqrt{3}
+    >>> _fix_sqrt("\\sqrt3")
+    '\\sqrt{3}'
     """
     if "\\sqrt" not in string:
         return string
@@ -210,7 +213,7 @@ def _strip_string(string: str) -> str:
 
     # remove percentage
     string = string.replace("\\%", "")
-    string = string.replace("\%", "")
+    string = string.replace(r"\%", "")
 
     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
     string = string.replace(" .", " 0.")
@@ -391,13 +394,13 @@ class MATHScenario(Scenario):
         for split, split_name in zip([TRAIN_SPLIT, TEST_SPLIT], ["train", "test"]):
             if split == TRAIN_SPLIT and self.use_official_examples:
                 train_instances = [
-                    ("What is $\left(\\frac{7}{8}\\right)^3 \cdot \left(\\frac{7}{8}\\right)^{-3}$?", "1"),
+                    ("What is $\\left(\\frac{7}{8}\\right)^3 \\cdot \\left(\\frac{7}{8}\\right)^{-3}$?", "1"),
                     (
                         "In how many ways can 4 books be selected from a shelf of 6 books"
                         + " if the order in which the books are selected does not matter?",
                         "15",
                     ),
-                    ("Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$", "\sqrt{59}"),
+                    ("Find the distance between the points $(2,1,-4)$ and $(5,8,-3).$", "\\sqrt{59}"),
                     (
                         "The faces of an octahedral die are labeled with digits $1$ through $8$."
                         + " What is the probability, expressed as a common fraction,"
@@ -449,3 +452,34 @@ class MATHScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
+        if self.use_chain_of_thought:
+            return ScenarioMetadata(
+                name="math_chain_of_thought",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv_chain_of_thought",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="math_regular",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems [(Hendrycks et al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv",
+                main_split="test",
+            )
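
The reworked docstrings above are now valid doctests. A small sanity check consistent with them, assuming crfm-helm is installed (note that _fix_a_slash_b and _fix_sqrt are private helpers):

    from helm.benchmark.scenarios.math_scenario import remove_boxed, _fix_a_slash_b, _fix_sqrt

    # Raw strings keep the backslashes literal, matching the doctest expectations.
    assert remove_boxed(r"\boxed{\frac{2}{3}}") == r"\frac{2}{3}"
    assert _fix_a_slash_b("2/3") == r"\frac{2}{3}"
    assert _fix_sqrt(r"\sqrt3") == r"\sqrt{3}"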