crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl

This diff shows the content of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.

Files changed (333)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/bbq_metrics.py +12 -0
  25. helm/benchmark/metrics/classification_metrics.py +19 -1
  26. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  27. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  28. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  29. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  30. helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
  31. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  32. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  33. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  34. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  35. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  36. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  37. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  38. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  39. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  40. helm/benchmark/metrics/medec_metrics.py +25 -2
  41. helm/benchmark/metrics/metric.py +25 -0
  42. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  43. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  44. helm/benchmark/metrics/safety_metrics.py +13 -1
  45. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  46. helm/benchmark/metrics/summac/model_summac.py +2 -2
  47. helm/benchmark/metrics/summarization_metrics.py +129 -1
  48. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  49. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  50. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  51. helm/benchmark/presentation/run_display.py +13 -3
  52. helm/benchmark/presentation/run_entry.py +2 -2
  53. helm/benchmark/presentation/schema.py +5 -22
  54. helm/benchmark/presentation/summarize.py +180 -11
  55. helm/benchmark/presentation/taxonomy_info.py +20 -0
  56. helm/benchmark/run.py +1 -1
  57. helm/benchmark/run_expander.py +4 -0
  58. helm/benchmark/run_specs/arabic_run_specs.py +140 -16
  59. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  60. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  61. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  62. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  63. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  64. helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
  65. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
  66. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  67. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  68. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  69. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  70. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  71. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  72. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  73. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  74. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
  75. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
  76. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
  77. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
  78. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
  79. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  80. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  81. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  82. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  83. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  84. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  85. helm/benchmark/scenarios/bold_scenario.py +15 -0
  86. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  87. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  88. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  89. helm/benchmark/scenarios/clear_scenario.py +23 -0
  90. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  91. helm/benchmark/scenarios/code_scenario.py +28 -0
  92. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  93. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  94. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  95. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  96. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  97. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  98. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  99. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  100. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  101. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  102. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  103. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  104. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  105. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  106. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  107. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  108. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  109. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  110. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  111. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  112. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  113. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  114. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  115. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  116. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  117. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  118. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  119. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  120. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  121. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  122. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  123. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  124. helm/benchmark/scenarios/ice_scenario.py +21 -1
  125. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  126. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  127. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
  128. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  129. helm/benchmark/scenarios/koala_scenario.py +21 -1
  130. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  131. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  132. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  133. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  134. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  135. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  136. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  137. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  138. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  139. helm/benchmark/scenarios/math_scenario.py +33 -0
  140. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  141. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  142. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  143. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  144. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  145. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  146. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  147. helm/benchmark/scenarios/medec_scenario.py +23 -0
  148. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  149. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  150. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  151. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  152. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  153. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  154. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  155. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  156. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  157. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  158. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  159. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  160. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  161. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  162. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  163. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  164. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  165. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  166. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  167. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  168. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  169. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  170. helm/benchmark/scenarios/quac_scenario.py +14 -0
  171. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  172. helm/benchmark/scenarios/raft_scenario.py +15 -0
  173. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  174. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  175. helm/benchmark/scenarios/scenario.py +31 -0
  176. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  177. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  178. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  179. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  180. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  181. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  182. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  183. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  184. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  185. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  186. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  187. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  188. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  189. helm/benchmark/scenarios/spider_scenario.py +18 -0
  190. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  191. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  192. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  193. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  194. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  195. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  196. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  197. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  198. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  199. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  200. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  201. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  202. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  203. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  204. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  205. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  206. helm/benchmark/static/schema_arabic.yaml +55 -12
  207. helm/benchmark/static/schema_long_context.yaml +11 -30
  208. helm/benchmark/static/schema_medhelm.yaml +36 -0
  209. helm/benchmark/static/schema_slp.yaml +219 -0
  210. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  211. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  212. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  213. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  214. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  215. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  216. helm/benchmark/static_build/index.html +5 -6
  217. helm/clients/ai21_client.py +2 -0
  218. helm/clients/aleph_alpha_client.py +2 -0
  219. helm/clients/anthropic_client.py +7 -1
  220. helm/clients/audio_language/diva_llama_client.py +2 -0
  221. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  222. helm/clients/audio_language/llama_omni/constants.py +9 -0
  223. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  224. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  225. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  226. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  227. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  228. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  229. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  230. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  231. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  232. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  233. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  234. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  235. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  236. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  237. helm/clients/audio_language/llama_omni/utils.py +202 -0
  238. helm/clients/audio_language/llama_omni_client.py +2 -1
  239. helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
  240. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  241. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  242. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  243. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  244. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  245. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  246. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  247. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  248. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  249. helm/clients/bedrock_client.py +2 -0
  250. helm/clients/cohere_client.py +3 -0
  251. helm/clients/google_client.py +2 -0
  252. helm/clients/http_model_client.py +2 -0
  253. helm/clients/huggingface_client.py +2 -1
  254. helm/clients/ibm_client.py +3 -1
  255. helm/clients/image_generation/adobe_vision_client.py +2 -0
  256. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  257. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  258. helm/clients/image_generation/cogview2_client.py +2 -1
  259. helm/clients/image_generation/dalle2_client.py +2 -0
  260. helm/clients/image_generation/dalle_mini_client.py +2 -1
  261. helm/clients/image_generation/deep_floyd_client.py +2 -0
  262. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  263. helm/clients/image_generation/lexica_client.py +2 -0
  264. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  265. helm/clients/image_generation/mindalle_client.py +2 -1
  266. helm/clients/image_generation/together_image_generation_client.py +2 -0
  267. helm/clients/megatron_client.py +2 -0
  268. helm/clients/mistral_client.py +2 -0
  269. helm/clients/moderation_api_client.py +2 -0
  270. helm/clients/openai_client.py +36 -20
  271. helm/clients/openai_responses_client.py +27 -3
  272. helm/clients/openrouter_client.py +31 -0
  273. helm/clients/palmyra_client.py +2 -1
  274. helm/clients/reka_client.py +2 -1
  275. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  276. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  277. helm/clients/test_openrouter_client.py +69 -0
  278. helm/clients/together_client.py +52 -11
  279. helm/clients/vertexai_client.py +12 -2
  280. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  281. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  282. helm/clients/vision_language/idefics_client.py +2 -1
  283. helm/clients/vision_language/open_flamingo_client.py +2 -1
  284. helm/clients/vision_language/paligemma_client.py +2 -1
  285. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  286. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  287. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  288. helm/clients/writer_client.py +2 -0
  289. helm/common/hierarchical_logger.py +20 -0
  290. helm/common/optional_dependencies.py +1 -1
  291. helm/common/test_general.py +4 -0
  292. helm/config/model_deployments.yaml +300 -1
  293. helm/config/model_metadata.yaml +302 -9
  294. helm/config/tokenizer_configs.yaml +92 -4
  295. helm/proxy/example_queries.py +8 -8
  296. helm/proxy/server.py +2 -1
  297. helm/proxy/static/index.css +4 -0
  298. helm/proxy/static/index.js +7 -1
  299. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  300. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  301. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  302. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  303. helm/benchmark/metrics/medalign_metrics.py +0 -14
  304. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  305. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  306. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  307. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  308. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  309. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  310. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  311. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  312. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  313. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  314. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  315. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  316. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  317. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
  318. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
  319. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
  320. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
  321. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  322. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  323. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  324. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  325. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  326. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  327. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  328. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  329. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  330. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  331. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  332. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  333. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/code_scenario.py
@@ -55,6 +55,7 @@ import os
 import sys
 from typing import List, Dict, Iterable, Optional, cast
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.code_scenario_helper import run as run_reindent
@@ -69,6 +70,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -331,3 +333,29 @@ class CodeScenario(Scenario):
             raise ValueError(f"Unknown dataset: {self.dataset}")
 
         return cast(List[Instance], instances)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset == "humaneval":
+            return ScenarioMetadata(
+                name="code_humaneval",
+                display_name="HumanEval (Code)",
+                description="The HumanEval benchmark for measuring functional correctness for synthesizing "
+                "programs from docstrings [(Chen et al., "
+                "2021)](https://arxiv.org/pdf/2107.03374.pdf).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="pass",
+                main_split="test",
+            )
+        elif self.dataset == "apps":
+            return ScenarioMetadata(
+                name="code_apps",
+                display_name="APPS (Code)",
+                description="The APPS benchmark for measuring competence on code challenges [(Hendrycks et "
+                "al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/c24cd76e1ce41366a4bbe8a49b02a028-Abstract-round2.html).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="test_avg",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset}")
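The same change repeats across the scenario files below: each scenario imports TaxonomyInfo and ScenarioMetadata and gains a get_metadata() method describing the benchmark. A minimal sketch of the pattern is shown here; the ToyScenario class and all of its field values are hypothetical, and only the imports and keyword arguments are taken from the hunks in this diff.

from typing import List

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Instance, Scenario, ScenarioMetadata


class ToyScenario(Scenario):
    """Hypothetical scenario used only to illustrate the new metadata hook."""

    name = "toy_scenario"
    description = "Toy scenario illustrating get_metadata()."
    tags = ["question_answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # A real scenario would download its dataset and build Instance objects here.
        return []

    def get_metadata(self) -> ScenarioMetadata:
        # Field names mirror the ScenarioMetadata/TaxonomyInfo usage visible in this diff.
        return ScenarioMetadata(
            name="toy_scenario",
            display_name="Toy Scenario",
            description="Hypothetical example of the metadata pattern added in this release.",
            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
            main_metric="exact_match",
            main_split="test",
        )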
helm/benchmark/scenarios/commonsense_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -72,6 +74,19 @@ class HellaSwagScenario(Scenario):
         assert len(answers) == 4
         return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="hellaswag",
+            display_name="HellaSwag",
+            description="The HellaSwag benchmark for commonsense reasoning in question answering "
+            "[(Zellers et al., 2019)](https://aclanthology.org/P19-1472/).",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="commonsense reasoning", when="?", who="?", language="English"
+            ),
+            main_metric="exact_match",
+            main_split="valid",
+        )
+
 
 class OpenBookQA(Scenario):
     name = "openbookqa"
@@ -113,6 +128,23 @@ class OpenBookQA(Scenario):
         assert item["question"]["choices"][correct_choice]["label"] == item["answerKey"]
         return _make_instance(question=question, answers=answers, correct_answer=correct_answer, split=split)
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openbookqa",
+            display_name="OpenbookQA",
+            description="The OpenbookQA benchmark for commonsense-intensive open book question "
+            "answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="elementary science",
+                when="2018",
+                who="Amazon Mechnical Turk workers",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
+
 
 class CommonSenseQAScenario(Scenario):
     name = "commonsenseqa"
helm/benchmark/scenarios/compositional_instructions.yaml (new file)
@@ -0,0 +1,70 @@
+rules:
+  - category: Root
+    expansions:
+      - text: ${Task} ${Style}
+
+  - category: Task
+    expansions:
+      - text: Explain ${HowTo}
+      - text: Explain ${Topic}
+      - text: Tell me a joke about ${Topic}
+      - text: Tell me a joke about ${TopicPair}
+      - text: What are the similarities between ${TopicPair}
+      - text: What are the differences between ${TopicPair}
+      - text: Tell me 5 surprising facts about ${Topic}
+      - text: Persuade me to learn about ${Topic}
+
+  - category: HowTo
+    expansions:
+      - text: how airplanes fly
+      - text: how maglev trains work
+      - text: how to grow tomatoes in the wintertime
+
+  - category: Topic
+    expansions:
+      - text: the quicksort algorithm
+      - text: stochastic gradient descent
+      - text: the Great Vowel Shift
+      - text: northern lights
+      - text: the Romantic period
+      - text: the Civil Rights movement
+      - text: the Pacific Northwest
+      - text: El Niño
+
+  - category: TopicPair
+    expansions:
+      - text: north and south
+      - text: gradient descent and gradient ascent
+      - text: vowels and consonants
+      - text: C and C++
+      - text: Google and Microsoft
+      - text: US and Canada
+      - text: cats and dogs
+      - text: the Baroque period and the Romantic period
+
+  - category: Style
+    expansions:
+      - text: as a paragraph.
+      - text: as a haiku.
+      - text: as a limerick.
+      - text: in the style of a Shakespeare sonnet.
+      - text: in the style of a court case.
+      - text: in the style of Snoop Dogg.
+      - text: so that a ${Age}-year old can understand it.
+      - text: in ${Language}.
+      - text: in 3 bullet points.
+      - text: in 8 bullet points.
+
+  - category: Age
+    expansions:
+      - text: "5"
+      - text: "9"
+      - text: "13"
+
+  - category: Language
+    expansions:
+      - text: Italian
+      - text: Greek
+      - text: Indian
+      - text: Chinese
+      - text: Thai
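The new compositional_instructions.yaml is a small generation grammar: Root expands to ${Task} ${Style}, and each ${Category} placeholder is filled from that category's expansions. A rough, self-contained sketch of how such a grammar could be expanded into concrete prompts follows; it is not HELM's own expansion code, and the file path, the use of PyYAML, and the random-choice strategy are assumptions made for illustration.

import random
import re

import yaml  # assumes PyYAML is available


def expand(category: str, rules_by_category: dict) -> str:
    # Pick one expansion for the category, then recursively fill ${...} placeholders.
    text = random.choice(rules_by_category[category])["text"]
    return re.sub(r"\$\{(\w+)\}", lambda m: expand(m.group(1), rules_by_category), text)


with open("compositional_instructions.yaml") as f:
    grammar = yaml.safe_load(f)

rules_by_category = {rule["category"]: rule["expansions"] for rule in grammar["rules"]}
print(expand("Root", rules_by_category))
# e.g. "Tell me a joke about cats and dogs in the style of Snoop Dogg."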
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded
 
@@ -95,3 +97,22 @@ class ConvFinQACalcScenario(Scenario):
         for raw_instance in raw_instances:
             instances.append(self.convert_to_instance(raw_instance, split))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="conv_fin_qa_calc",
+            display_name="ConvFinQACalc",
+            short_display_name=None,
+            description="A mathematical calculation benchmark based on ConvFinQA: Exploring the Chain "
+            "of Numerical Reasoning in Conversational Finance Question Answering [(Chen ey "
+            "al., 2022)](https://arxiv.org/pdf/2210.03849.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="question answering with numeric reasoning",
+                what="financial reports",
+                when="1999 to 2019",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="float_equiv",
+            main_split="valid",
+        )
helm/benchmark/scenarios/copyright_scenario.py
@@ -3,8 +3,18 @@ import os
 import tqdm
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Reference, CORRECT_TAG, TEST_SPLIT, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 datatag2hash_text = {
     # The "average" book.
@@ -81,3 +91,27 @@ class CopyrightScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.datatag in datatag2hash_code:
+            return ScenarioMetadata(
+                name="copyright_code",
+                display_name="Copyright (code)",
+                description="Scenario introduced in this work to measure copyright and memorization "
+                "behavior for code, based off of [Carlini et al. "
+                "(2021)](https://www.usenix.org/biblio-11958).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="unknown",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="copyright_text",
+                display_name="Copyright (text)",
+                description="Scenario introduced in this work to measure copyright and memorization "
+                "behavior for books, based off of [Carlini et al. "
+                "(2021)](https://www.usenix.org/biblio-11958).",
+                taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+                main_metric="unknown",
+                main_split="test",
+            )
helm/benchmark/scenarios/cti_to_mitre_scenario.py
@@ -6,6 +6,7 @@ from typing import Any, List, Dict
 import pandas as pd
 from pandas import DataFrame
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -238,3 +240,22 @@ class CtiToMitreScenario(Scenario):
         # return all instances
         all_instances = instances_train + instances_test
         return all_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="cti_to_mitre",
+            display_name="CTI-to-MITRE Cyber Threat Intelligence",
+            short_display_name=None,
+            description="A classification benchmark based on Automatic Mapping of Unstructured Cyber "
+            "Threat Intelligence - An Experimental Study [(Orbinato et al., "
+            "2022)](https://arxiv.org/pdf/2208.12144.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="Descriptions of malicious techniques",
+                when="Before 2022",
+                who="Security professionals",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/czech_bank_qa_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Scenario,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -128,3 +130,19 @@ CREATE TABLE "trans" (
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="czech_bank_qa",
+            display_name="CzechBankQA",
+            description="The CzechBankQA",
+            taxonomy=TaxonomyInfo(
+                task="text-to-SQL",
+                what="queries from financial experts",
+                when="1999",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="error_rate",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py
@@ -1,9 +1,19 @@
 import os
 import json
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.general import ensure_directory_exists
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, CORRECT_TAG, Output, VALID_SPLIT
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    CORRECT_TAG,
+    Output,
+    VALID_SPLIT,
+    ScenarioMetadata,
+)
 
 TASKS = {
     "counterfactual": ["snli_premise", "snli_hypothesis"]
@@ -167,3 +177,14 @@ class DecodingTrustAdvDemoScenario(Scenario):
             if self.demo_name in ["cf", "zero"]:
                 break
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_demonstration",
+            display_name="DecodingTrust - Adversarial Demonstrations",
+            short_display_name="AdvDemo",
+            description="Robustness analysis of LM generations when facing adversarial demonstrations",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py
@@ -1,8 +1,18 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, VALID_SPLIT, Reference, Output, CORRECT_TAG
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    VALID_SPLIT,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 TASK_DESCRIPTIONS = {
     "sst2": "For the given sentence, label the sentiment of the sentence as positive or negative. The answer should be "
@@ -119,3 +129,15 @@ class DecodingTrustAdvRobustnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_robustness",
+            display_name="DecodingTrust - AdvGLUE++",
+            short_display_name="AdvGLUE++",
+            description="Adversarial perturbations of the GLUE dataset generated against open-source "
+            "LLMs including Alpaca, Vicuna, and Stable-Vicuna",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_fairness_scenario.py
@@ -1,8 +1,18 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, VALID_SPLIT, Reference, Output, CORRECT_TAG
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    VALID_SPLIT,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 TASK_DESCRIPTIONS = {
     "predict_annual_income": "For each snippet of text, predict whether the person described has an annual income "
@@ -75,3 +85,14 @@ class DecodingTrustFairnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_fairness",
+            display_name="DecodingTrust - Fairness",
+            short_display_name="Fairness",
+            description="Fairness analysis of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py
@@ -4,8 +4,17 @@ import random
 import numpy as np
 import pandas as pd
 from typing import List, Dict, Tuple, Union
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, CORRECT_TAG
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    ScenarioMetadata,
+)
 
 OPTION_DICT: Dict[str, Union[List[int], List[str]]] = {
     "ethics_commonsense_short": ["not wrong", "wrong"],
@@ -322,3 +331,14 @@ class DecodingTrustMachineEthicsScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_machine_ethics",
+            display_name="DecodingTrust - Ethics",
+            short_display_name="Ethics",
+            description="Evaluation of the understanding of ethical behaviors of LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py
@@ -1,6 +1,7 @@
 import os
 import json
 from typing import List, Dict
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Output,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 
 TASK = {
@@ -202,3 +204,14 @@ class DecodingTrustOODRobustnessScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_adv_demonstration",
+            display_name="DecodingTrust - Adversarial Demonstrations",
+            short_display_name="AdvDemo",
+            description="Robustness analysis of LM generations when facing adversarial demonstrations",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py
@@ -4,8 +4,9 @@ import string
 import random
 from tqdm import tqdm
 from typing import List, Dict, Optional
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Reference, Output, TEST_SPLIT, ScenarioMetadata
 option_keys = ["pii", "privacy_understanding", "enron_email_extraction"]
 
 
@@ -217,6 +218,17 @@ class DecodingTrustPrivacyScenario(Scenario):
 
         return instances
 
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_privacy",
+            display_name="DecodingTrust - Privacy",
+            short_display_name="Privacy",
+            description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
+
 
 def get_local_domain(email):
     return email.split("@")
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py
@@ -1,7 +1,8 @@
 import json
 import os
 from typing import List, Dict
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import Reference, Output
 
@@ -66,3 +67,14 @@ class DecodingTrustStereotypeBiasScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_stereotype_bias",
+            display_name="DecodingTrust - Stereotype Bias",
+            short_display_name="Stereotype",
+            description="Manually crafted stereotype user prompts from DecodingTrust",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py
@@ -3,8 +3,9 @@ import os
 import random
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 DATA_REPO_HASH = "38972f6ccbf376a8d0660babafb4d2b3b9cca3f4"
@@ -76,3 +77,14 @@ class DecodingTrustToxicityPromptsScenario(Scenario):
         random.shuffle(instances)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="decodingtrust_toxicity_prompts",
+            display_name="DecodingTrust - Toxicity",
+            short_display_name="Toxicity",
+            description="Evaluation of the privacy understanding and privacy preserving properties of " "LLMs",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
helm/benchmark/scenarios/dischargeme_scenario.py
@@ -1,4 +1,5 @@
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -8,6 +9,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 import pandas as pd
 
@@ -170,3 +172,25 @@ class DischargeMeScenario(Scenario):
             lines = file.readlines()
             lines = [line.strip() for line in lines]
         return lines
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="dischargeme",
+            display_name="DischargeMe",
+            short_display_name="DischargeMe",
+            description="DischargeMe is a benchmark designed to evaluate clinical text generation. It "
+            "pairs discharge summaries and radiology reports from MIMIC-IV with generation "
+            "tasks such as writing discharge instructions or summarizing the brief hospital "
+            "course. The benchmark assesses a model's ability to generate patient-facing "
+            "documentation that is complete, empathetic, and clinically accurate [(Xu, "
+            "2024)](https://physionet.org/content/discharge-me/1.3/).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate discharge instructions from hospital notes",
+                when="Upon hospital discharge",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="dischargeme_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/disinformation_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 REITERATION_DATA_URL = "https://drive.google.com/uc?export=download&id=1uVJbsgPCHFAvH43I6SVvU3Ayo8dh-y_N"
@@ -175,3 +177,23 @@ class DisinformationScenario(Scenario):
             instances = self.create_wedging_instances(data)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.capability == "reiteration":
+            name = "disinformation_reiteration"
+            display_name = "Disinformation (reiteration)"
+        elif self.capability == "wedging":
+            name = "disinformation_wedging"
+            display_name = "Disinformation (wedging)"
+        else:
+            raise Exception(f"Unknown capability {self.capability}")
+        return ScenarioMetadata(
+            name=name,
+            display_name=display_name,
+            description="Scenario from [Buchanan et al. "
+            "(2021)](https://cset.georgetown.edu/publication/truth-lies-and-automation/) "
+            "that tests the ability to generate divisive and wedging content.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="valid",
+        )