crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. The information is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
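
The scenario hunks below repeatedly construct a TaxonomyInfo, imported from the new helm/benchmark/presentation/taxonomy_info.py (entry 72 above, +20 lines), which is not itself shown in this excerpt. As a reading aid, here is a hedged sketch of what that module plausibly contains, inferred purely from the keyword arguments used at the call sites below:

    # Hedged reconstruction, not the packaged source: the field names come from
    # the call sites in the hunks below; the Optional[str] defaults are an assumption.
    from dataclasses import dataclass
    from typing import Optional


    @dataclass(frozen=True)
    class TaxonomyInfo:
        task: Optional[str] = None      # e.g. "question answering"
        what: Optional[str] = None      # e.g. "tweets"
        when: Optional[str] = None      # time period of the data; "?" where unknown
        who: Optional[str] = None       # text producers, e.g. "Twitter users"
        language: Optional[str] = None  # e.g. "Indonesian"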

helm/benchmark/scenarios/seahelm_scenario.py
@@ -5,6 +5,7 @@ from typing import List, Dict
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
     TRAIN_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
@@ -129,6 +131,27 @@ class TyDiQAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="tydiqa",
+            display_name="TyDiQA",
+            short_display_name=None,
+            description="TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an "
+            "open-book question answering dataset for 11 typologically-diverse languages. "
+            "The questions are written by people who want to know the answer, but do not "
+            "know the answer yet, and the data is collected directly in each language "
+            "without the use of translation.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions by human annotators about Wikipedia articles",
+                when="?",
+                who="human annotators",
+                language="Indonesian",
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 1.2 Vietnamese & Thai: XQuAD
 class XQuADScenario(Scenario):
@@ -232,6 +255,28 @@ class XQuADScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xquad_{self.language}",
+            display_name=f"XQuAD ({self.language})",
+            short_display_name=None,
+            description="XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book "
+            "question answering dataset that is parallel across 10 languages. The dataset "
+            "consists of a subset of 240 paragraphs and 1190 question-answer pairs from the "
+            "development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their "
+            "professional translations.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions by crowdworkers about Wikipedia articles translated "
+                f"from English to {self.language}",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 1.3 Tamil: IndicQA
 class IndicQAScenario(Scenario):
@@ -341,6 +386,27 @@ class IndicQAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicqa",
+            display_name="IndicQA",
+            short_display_name=None,
+            description="IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693) is an "
+            "open-book question answering dataset for 11 Indic languages. Answers to "
+            "questions are to be extracted from the text provided. The data is taken from "
+            "Wikipedia articles across various domains and questions and answers were "
+            "manually created by native speakers.\n",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="questions about Wikipedia articles translated by native " "speakers from English to Tamil",
+                when="?",
+                who="?",
+                language="Tamil",
+            ),
+            main_metric="squad_f1_score",
+            main_split="test",
+        )
+
 
 # 2. Sentiment Analysis
 # 2.1 Indonesian: NusaX Sentiment
@@ -445,6 +511,25 @@ class NusaXScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="nusax",
+            display_name="NusaX",
+            short_display_name=None,
+            description="NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an "
+            "Indonesian sentiment analysis dataset. The data consists of comments and "
+            "reviews from various online platforms.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="online comments and reviews",
+                when="?",
+                who="internet users",
+                language="Indonesian",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.2 Vietnamese: UIT-VSFC
 class UITVSFCScenario(Scenario):
@@ -543,6 +628,25 @@ class UITVSFCScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="uitvsfc",
+            display_name="UIT-VSFC",
+            short_display_name=None,
+            description="UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a "
+            "Vietnamese sentiment analysis dataset. The data consists of student feedback "
+            "obtained from end-of-semester surveys at a Vietnamese university.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="university student end-of-semester survey responses",
+                when="?",
+                who="university students",
+                language="Vietnamese",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.3 Thai: Wisesight Sentiment
 class WisesightScenario(Scenario):
@@ -634,6 +738,25 @@ class WisesightScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wisesight",
+            display_name="Wisesight",
+            short_display_name=None,
+            description="Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is "
+            "a Thai sentiment analysis scenario. The data consists of social media "
+            "messages regarding consumer products and services.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="social media messages regarding consumer products and services",
+                when="?",
+                who="social media users",
+                language="Thai",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 2.4 Tamil: IndicSentiment
 class IndicSentimentScenario(Scenario):
@@ -723,6 +846,22 @@ class IndicSentimentScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicsentiment",
+            display_name="IndicSentiment",
+            short_display_name=None,
+            description="IndicSentiment is a Tamil sentiment analysis dataset that comes from "
+            "IndicXTREME [(Doddapaneni, "
+            "2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product "
+            "reviews that were written by annotators. Labels are positive or negative.\n",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="product reviews", when="?", who="human annotators", language="Tamil"
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3. Toxicity Detection/Classification
 # 3.1 Indonesian: Multi-Label Hate Speech Detection
@@ -835,6 +974,24 @@ class MLHSDScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="mlhsd",
+            display_name="MLHSD",
+            short_display_name=None,
+            description="MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian "
+            "toxicity detection dataset obtained from tweets on Twitter.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification",
+                what="tweets",
+                when="?",
+                who="Twitter users",
+                language="Indonesian",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3.2 Vietnamese: ViHSD
 class ViHSDScenario(Scenario):
@@ -927,6 +1084,26 @@ class ViHSDScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vihsd",
+            display_name="ViHSD",
+            short_display_name=None,
+            description="ViHSD [(Luu, "
+            "2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35) is a "
+            "Vietnamese toxicity detection dataset obtained from comments on Facebook, "
+            "Youtube, Instagram, and Tiktok.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification",
+                what="social media comments",
+                when="?",
+                who="Social media users",
+                language="Vietnamese",
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # 3.3 Thai: Thai Toxicity Tweets
 class ThaiToxicityTweetsScenario(Scenario):
@@ -1013,6 +1190,21 @@ class ThaiToxicityTweetsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="thaitoxicitytweets",
+            display_name="Thai Toxicity Tweets",
+            short_display_name=None,
+            description="Thai Toxicity Tweets [(Sirihattasak, "
+            "2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a "
+            "Thai toxicity detection dataset obtained from tweets on Twitter.\n",
+            taxonomy=TaxonomyInfo(
+                task="toxicity detection/classification", what="tweets", when="", who="Twitter users", language="Thai"
+            ),
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
+
 
 # B. Natural Language Generation
 # 1. Machine Translation
@@ -1111,6 +1303,28 @@ class FloresScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"flores_{self.source}_{self.target}",
+            display_name=f"Flores ({self.source} to {self.target})",
+            short_display_name=None,
+            description="Flores [(NLLB Team, "
+            "2022)](https://research.facebook.com/publications/no-language-left-behind/) "
+            "was created with professional human translators who translate the FLORES "
+            "source dataset into the target languages and a separate group of independent "
+            "translation reviewers who perform quality assessments of the human "
+            "translations and provide translation feedback to the translators.\n",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="translations from professional human translators",
+                when="?",
+                who="professional human translators",
+                language=f"{self.source}, {self.target}",
+            ),
+            main_metric="chr_f_plus_plus",
+            main_split="test",
+        )
+
 
 # C. Natural Language Reasoning
 # 1. Natural Language Inference
@@ -1207,6 +1421,26 @@ class IndoNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indonli",
+            display_name="IndoNLI",
+            short_display_name=None,
+            description="IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a "
+            "natural language inference dataset obtained from Wikipedia, news, and web "
+            "articles that incorporates various linguistic phenomena such as numerical "
+            "reasoning, structural changes, idioms, or temporal and spatial reasoning.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="Wikipedia, news, and web articles",
+                when="?",
+                who="?",
+                language="Indonesian",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1.2 Vietnamese & Thai: XNLI
 class XNLIScenario(Scenario):
@@ -1305,6 +1539,25 @@ class XNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xnli_{self.language}",
+            display_name=f"XNLI ({self.language})",
+            short_display_name=None,
+            description="XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural "
+            "language inference dataset obtained from crowdsourced NLI data then "
+            "professionally translated across 14 other languages.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="crowdsourced NLI data professionally translated",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1.3 Tamil: IndicXNLI
 class IndicXNLIScenario(Scenario):
@@ -1398,6 +1651,25 @@ class IndicXNLIScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="indicxnli",
+            display_name="IndicXNLI",
+            short_display_name=None,
+            description="IndicXNLI is a Tamil natural language inference dataset that comes from "
+            "IndicXTREME [(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), "
+            "which was automatically translated from XNLI into 11 Indic languages.\n",
+            taxonomy=TaxonomyInfo(
+                task="natural language inference",
+                what="crowdsourced NLI data professionally translated into Tamil",
+                when="?",
+                who="?",
+                language="Tamil",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2. Causal Reasoning: XCOPA
 class XCOPAScenario(Scenario):
@@ -1529,6 +1801,25 @@ class XCOPAScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"xcopa_{self.language}",
+            display_name=f"XCOPA ({self.language})",
+            short_display_name=None,
+            description="XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal "
+            "reasoning dataset, a translation and reannotation of the English COPA. English "
+            "COPA included questions that directly assess commonsense causal reasoning.\n",
+            taxonomy=TaxonomyInfo(
+                task="causal reasoning",
+                what="commonsense causal reasoning questions translated into " "Indonesian",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 1. Syntax: LINDSEA Minimal Pairs
 class LINDSEASyntaxMinimalPairsScenario(Scenario):
@@ -1650,6 +1941,26 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_syntax_minimal_pairs_{self.language}",
+            display_name="LINDSEA Syntax Minimal Pairs",
+            short_display_name=None,
+            description="LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA "
+            "[(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of "
+            "sentences that differ minimally from each other and contrast in grammatical "
+            "acceptability.\n",
+            taxonomy=TaxonomyInfo(
+                task="minimal pairs",
+                what="sentence pairs with minimal differences and contrasting " "grammatical acceptability",
+                when="?",
+                who="?",
+                language=self.language,
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2.1 Pragmatics: LINDSEA Presuppositions
 class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
@@ -1750,7 +2061,7 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+        passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
             question=question.format(row["question_translated"]),
             text_noun=text_noun,
             text=row["text"],
@@ -1798,6 +2109,24 @@ class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
             outputs.append(instance)
         return outputs
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_pragmatics_presuppositions_{self.language}",
+            display_name="LINDSEA Pragmatics Presuppositions",
+            short_display_name=None,
+            description="LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics "
+            "dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), "
+            "involving two formats: single and pair sentences. For single sentence "
+            "questions, the system under test needs to determine if the sentence is "
+            "true/false. For pair sentence questions, the system under test needs to "
+            "determine whether a conclusion can be drawn from another sentence.\n",
+            taxonomy=TaxonomyInfo(
+                task="pragmatic reasoning", what="presuppositions", when="?", who="?", language=self.language
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 # 2.2 Pragmatics: LINDSEA Scalar Implicatures
 class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
@@ -1898,7 +2227,7 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
         text_noun = self.prompt_components["text_noun"]
         instruction = self.prompt_components["single_instruction"]
 
-        passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+        passage = "{question}\\{text_noun}: {text}\n{instruction}".format(
             question=question.format(row["question_translated"]),
             text_noun=text_noun,
             text=row["text"],
@@ -1945,3 +2274,22 @@ class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
             )
             outputs.append(instance)
         return outputs
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=f"lindsea_pragmatics_scalar_implicatures_{self.language}",
+            display_name="LINDSEA Pragmatics Scalar Implicatures",
+            short_display_name=None,
+            description="LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for "
+            "pragmatics dataset from BHASA [(Leong, "
+            "2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and "
+            "pair sentences. For single sentence questions, the system under test needs to "
+            "determine if the sentence is true/false. For pair sentence questions, the "
+            "system under test needs to determine whether a conclusion can be drawn from "
+            "another sentence.\n",
+            taxonomy=TaxonomyInfo(
+                task="pragmatic reasoning", what="scalar implicatures", when="?", who="?", language=self.language
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
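
Each get_metadata implementation above returns a ScenarioMetadata built from the same handful of keyword arguments. A hedged sketch of its likely shape, inferred from these call sites rather than from the packaged helm/benchmark/scenarios/scenario.py (the SHC hunks further down omit short_display_name, so that field presumably has a default):

    # Hedged sketch of ScenarioMetadata as exported from
    # helm.benchmark.scenarios.scenario; inferred from call sites in this diff.
    from dataclasses import dataclass
    from typing import Optional

    from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo


    @dataclass(frozen=True)
    class ScenarioMetadata:
        name: str                                 # machine-readable id, e.g. "tydiqa"
        display_name: str                         # human-readable name, e.g. "TyDiQA"
        description: str                          # markdown description with citation link
        short_display_name: Optional[str] = None
        taxonomy: Optional[TaxonomyInfo] = None
        main_metric: Optional[str] = None         # e.g. "squad_f1_score"
        main_split: Optional[str] = None          # e.g. "test"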

helm/benchmark/scenarios/self_instruct_scenario.py
@@ -2,8 +2,18 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    TEST_SPLIT,
+    Output,
+    ScenarioMetadata,
+)
 
 
 class SelfInstructScenario(Scenario):
@@ -46,3 +56,21 @@ class SelfInstructScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="self_instruct",
+            display_name="Self Instruct",
+            short_display_name="Self Instruct",
+            description="The manually-curated instructions from the Self-Instruct paper ([Wang et al., "
+            "2023](https://aclanthology.org/2023.acl-long.754.pdf)).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2022",
+                who="Authors of the research paper",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_bmt_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -73,3 +75,23 @@ class SHCBMTMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_bmt_med",
+            display_name="BMT-Status",
+            description="BMT-Status is a benchmark composed of clinical notes and associated binary "
+            "questions related to bone marrow transplant (BMT), hematopoietic stem cell "
+            "transplant (HSCT), or hematopoietic cell transplant (HCT) status. The goal is "
+            "to determine whether the patient received a subsequent transplant based on the "
+            "provided clinical documentation.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Answer bone marrow transplant questions",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_cdi_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -73,3 +75,21 @@ class SHCCDIMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_cdi_med",
+            display_name="CDI-QA",
+            description="CDI-QA is a benchmark constructed from Clinical Documentation Integrity (CDI) "
+            "notes. It is used to evaluate a model's ability to verify clinical conditions "
+            "based on documented evidence in patient records.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Answer verification questions from CDI notes",
+                when="Any",
+                who="Hospital Administrator",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/shc_conf_scenario.py
@@ -2,6 +2,7 @@ import sys
 import csv
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import check_file_exists
 
@@ -74,3 +76,24 @@ class SHCCONFMedScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="shc_conf_med",
+            display_name="MedConfInfo",
+            description="MedConfInfo is a benchmark comprising clinical notes from adolescent patients. "
+            "It is used to evaluate whether the content contains sensitive protected health "
+            "information (PHI) that should be restricted from parental access, in "
+            "accordance with adolescent confidentiality policies in clinical care "
+            "[(Rabbani et al., "
+            "2024)](https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Identify sensitive health info in adolescent notes",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
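
Taken together, these hunks give each scenario a self-describing metadata hook. For illustration only (the helper below is hypothetical and not part of crfm-helm), a caller could use the new API to print a one-line summary per scenario, assuming the ScenarioMetadata fields sketched earlier:

    # Illustrative usage sketch; print_metadata_summary is hypothetical and
    # assumes the ScenarioMetadata shape inferred from the call sites above.
    from typing import Iterable

    from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


    def print_metadata_summary(scenarios: Iterable[Scenario]) -> None:
        for scenario in scenarios:
            md: ScenarioMetadata = scenario.get_metadata()
            language = md.taxonomy.language if md.taxonomy else "?"
            print(f"{md.name}: {md.main_metric} on '{md.main_split}' split ({language})")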