crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (1033)
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,71 @@
1
+ import datasets
2
+ import os
3
+ from typing import List
4
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ TEST_SPLIT,
10
+ Input,
11
+ Output,
12
+ CORRECT_TAG,
13
+ ScenarioMetadata,
14
+ )
15
+ from helm.common.general import ensure_directory_exists
16
+
17
+
18
class OmniMATHScenario(Scenario):
    """Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models

    Omni-MATH is a benchmark designed to assess LLMs' mathematical reasoning at the
    Olympiad level. Per the dataset description it comprises 4428 competition-level
    problems, categorized into 33 (and potentially more) sub-domains and spanning
    10 distinct difficulty levels.

    Every dataset row becomes one test instance: the row's ``problem`` field is the
    input text and its ``answer`` field is the single correct reference.
    """

    name = "omni_math"
    description = "A Universal Olympiad Level Mathematic Benchmark for Large Language Models"
    tags = ["math"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the pinned Omni-MATH test split and turn each row into an ``Instance``.

        Args:
            output_path: Scenario output directory; the HuggingFace cache is placed
                in its ``data`` sub-directory.

        Returns:
            One ``Instance`` per dataset row, all assigned to ``TEST_SPLIT``.
        """
        data_dir = os.path.join(output_path, "data")
        ensure_directory_exists(data_dir)

        # The revision is pinned so repeated runs read the same dataset snapshot.
        omni_math = datasets.load_dataset(
            "KbsdJames/Omni-MATH",
            revision="40ba231d8f16e29ecd40e6407e2c8640145a8f62",
            cache_dir=data_dir,
            split="test",
        )
        assert isinstance(omni_math, datasets.Dataset)

        return [
            Instance(
                input=Input(text=row["problem"]),
                references=[Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
            )
            for row in omni_math
        ]

    def get_metadata(self) -> ScenarioMetadata:
        """Return the static metadata record for this scenario."""
        taxonomy = TaxonomyInfo(
            task="mathematics",
            what="universal Olympiad level mathematic benchmark",
            who="human annotators",
            when="2024",
            language="English",
        )
        return ScenarioMetadata(
            name=self.name,
            display_name="Omni-MATH",
            description=self.description,
            main_metric="omni_math_accuracy",
            main_split="test",
            taxonomy=taxonomy,
        )
@@ -2,7 +2,18 @@ from typing import List, Dict, Any, DefaultDict
2
2
  from datasets import load_dataset, Dataset
3
3
  from collections import defaultdict
4
4
 
5
- from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TRAIN_SPLIT, VALID_SPLIT, Output
5
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
6
+ from helm.benchmark.scenarios.scenario import (
7
+ CORRECT_TAG,
8
+ Reference,
9
+ Scenario,
10
+ Instance,
11
+ Input,
12
+ TRAIN_SPLIT,
13
+ VALID_SPLIT,
14
+ Output,
15
+ ScenarioMetadata,
16
+ )
6
17
 
7
18
 
8
19
  class OpenAssistantScenario(Scenario):
@@ -110,10 +121,30 @@ class OpenAssistantScenario(Scenario):
110
121
  return instances
111
122
 
112
123
  # Download the raw data from Huggingface
113
- dataset: Any = load_dataset("OpenAssistant/oasst1")
124
+ dataset: Any = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")
114
125
 
115
126
  # Get the instances for each split
116
127
  train_instances = get_split_instances(dataset["train"], TRAIN_SPLIT)
117
128
  valid_instances = get_split_instances(dataset["validation"], VALID_SPLIT)
118
129
 
119
130
  return train_instances + valid_instances
131
+
132
def get_metadata(self) -> ScenarioMetadata:
    """Return the static metadata record describing the Open Assistant scenario."""
    taxonomy = TaxonomyInfo(
        task="open-ended instruction following",
        what="Human-written dialogues and response rankings",
        when="2023",
        who="Open Assistant participants",
        language="35 languages",
    )
    summary = (
        "LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 "
        "conversation trees ([Köpf et al., "
        "2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial "
        "prompt in each conversation."
    )
    return ScenarioMetadata(
        name="open_assistant",
        display_name="Open Assistant",
        short_display_name="Open Assistant",
        description=summary,
        taxonomy=taxonomy,
        main_metric="Helpfulness",
        main_split="valid",
    )
@@ -0,0 +1,94 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import List, Optional
5
+
6
+ import datasets
7
+ import tiktoken
8
+
9
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
10
+ from helm.benchmark.scenarios.scenario import (
11
+ CORRECT_TAG,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ Instance,
16
+ TEST_SPLIT,
17
+ Input,
18
+ ScenarioMetadata,
19
+ )
20
+ from helm.common.general import ensure_directory_exists
21
+
22
+
23
class OpenAIMRCRScenario(Scenario):
    """OpenAI MRCR scenario

    OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking
    an LLM's ability to distinguish between multiple needles hidden in context. This eval is
    inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).

    The task is as follows: The model is given a long, multi-turn, synthetically generated
    conversation between user and model where the user asks for a piece of writing about a topic,
    e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
    are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
    of one of those asks. For example, "Return the 2nd poem about tapirs".

    Reference: https://huggingface.co/datasets/openai/mrcr"""

    name = "openai_mrcr"
    description = (
        "OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset "
        "for benchmarking an LLM's ability to distinguish between multiple needles "
        "hidden in context. This eval is inspired by the MRCR eval first introduced by "
        "[Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)."
    )
    tags = ["long_context", "mrcr"]

    # Supported needle counts; each one selects a "<n>needle.parquet" data file.
    NEEDLES_OPTIONS = [2, 4, 8]

    def __init__(self, needles: int, max_num_words: Optional[int] = None):
        """Configure the scenario.

        Args:
            needles: Number of identical asks hidden in each conversation; must be
                one of ``NEEDLES_OPTIONS``.
            max_num_words: If truthy, conversations whose whitespace-delimited word
                count exceeds this value are skipped by ``get_instances``.

        Raises:
            ValueError: If ``needles`` is not one of ``NEEDLES_OPTIONS``.
        """
        super().__init__()
        # Reject bad arguments before storing any state or loading the tokenizer.
        if needles not in self.NEEDLES_OPTIONS:
            raise ValueError(f"Needles must be one of {self.NEEDLES_OPTIONS}")
        self.needles = needles
        self.max_num_words = max_num_words
        self.tokenizer = tiktoken.get_encoding("o200k_base")

    def count_words(self, messages: List[dict]) -> int:
        """Return the total number of whitespace-separated chunks over all message contents.

        Each message's ``content`` is stripped and split on runs of whitespace. Note
        that a message with empty content still contributes 1, because splitting an
        empty string yields a single empty chunk.
        """
        return sum(len(re.split(r"\s+", message["content"].strip())) for message in messages)

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the pinned MRCR data file for ``self.needles`` and build the instances.

        Args:
            output_path: Scenario output directory; the HuggingFace cache is placed
                in its ``data`` sub-directory.

        Returns:
            One ``Instance`` per retained row, all assigned to ``TEST_SPLIT``.
        """
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        dataset = datasets.load_dataset(
            "openai/mrcr",
            cache_dir=cache_dir,
            split="train",
            data_files=[f"{self.needles}needle.parquet"],
            revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
        )

        instances: List[Instance] = []
        for idx, row in enumerate(dataset):
            # The prompt column holds the chat messages as a JSON-encoded list.
            messages = json.loads(row["prompt"])
            # A falsy limit (None or 0) disables the length filter.
            if self.max_num_words and self.count_words(messages) > self.max_num_words:
                continue
            instances.append(
                Instance(
                    # idx is the row's position in the unfiltered file, so ids stay
                    # stable regardless of max_num_words.
                    id=f"{self.needles}needle{idx}",
                    input=Input(messages=messages),
                    references=[Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                    extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
                )
            )

        return instances

    def get_metadata(self) -> ScenarioMetadata:
        """Return the static metadata record for this scenario."""
        return ScenarioMetadata(
            name=self.name,
            display_name="OpenAI MRCR",
            description=self.description,
            taxonomy=TaxonomyInfo(task="MRCR", what="Synthetic data", when="2025", who="None", language="English"),
            main_metric="openai_mrcr_accuracy",
            main_split="test",
        )
@@ -4,7 +4,7 @@ from typing import List, Dict
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
6
 
7
- from .scenario import (
7
+ from helm.benchmark.scenarios.scenario import (
8
8
  Scenario,
9
9
  Instance,
10
10
  Reference,
@@ -108,7 +108,6 @@ class OpinionsQAScenario(Scenario):
108
108
  self.context: str = context
109
109
 
110
110
  def download_data(self, output_path: str):
111
-
112
111
  data_dir: str = os.path.join(output_path, "data")
113
112
  if not os.path.exists(data_dir):
114
113
  os.makedirs(data_dir)
@@ -150,14 +149,12 @@ class OpinionsQAScenario(Scenario):
150
149
  bios_df = pd.read_csv(bios_path, sep="\t")
151
150
 
152
151
  for split in all_splits:
153
-
154
152
  csv_path: str = csv_dict[split]
155
153
  assert os.path.exists(csv_path)
156
154
 
157
155
  question_df = self.read_survey_questions(csv_path)
158
156
 
159
157
  for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
160
-
161
158
  # Opinions QA test questions have no correct answer and thus we set it to be None by default
162
159
  # for all test instances.
163
160
  # In the case where context = steer-qa, we add demographic information in the form of a
@@ -182,7 +179,6 @@ class OpinionsQAScenario(Scenario):
182
179
  else:
183
180
  # context = "steer-bio"or "steer-portray"
184
181
  for bio in bios_df["question"].values:
185
-
186
182
  context = PassageQuestionInput(passage=bio, question=question + "\n")
187
183
  instance = Instance(
188
184
  context,
@@ -2,8 +2,18 @@ import json
2
2
  import os
3
3
  from typing import Dict, List
4
4
 
5
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
5
6
  from helm.common.general import ensure_directory_exists, ensure_file_downloaded
6
- from .scenario import Scenario, Instance, ALL_SPLITS, CORRECT_TAG, Reference, PassageQuestionInput, Output
7
+ from helm.benchmark.scenarios.scenario import (
8
+ Scenario,
9
+ Instance,
10
+ ALL_SPLITS,
11
+ CORRECT_TAG,
12
+ Reference,
13
+ PassageQuestionInput,
14
+ Output,
15
+ ScenarioMetadata,
16
+ )
7
17
 
8
18
 
9
19
  class PubMedQAScenario(Scenario):
@@ -117,7 +127,12 @@ class PubMedQAScenario(Scenario):
117
127
  """
118
128
 
119
129
  name = "pubmed_qa"
120
- description = "A biomedical question answering (QA) dataset collected from PubMed abstracts."
130
# Adjacent string literals are joined verbatim, so every fragment except the last
# must end with a space to keep the words apart.
description = (
    "PubMedQA is a biomedical question-answering dataset that evaluates a model's "
    "ability to interpret scientific literature. It consists of PubMed abstracts paired with "
    "yes/no/maybe questions derived from the content. The benchmark assesses a model's "
    "capability to reason over biomedical texts and provide factually grounded answers."
)
121
136
  tags = ["question_answering", "biomedical"]
122
137
 
123
138
  POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
@@ -125,48 +140,71 @@ class PubMedQAScenario(Scenario):
125
140
  def get_instances(self, output_path: str) -> List[Instance]:
126
141
  data_path: str = os.path.join(output_path, "data")
127
142
  ensure_directory_exists(data_path)
128
-
143
+ url = (
144
+ "https://raw.githubusercontent.com/pubmedqa/pubmedqa/"
145
+ "1f00b98d5cc626844bf8c4ca513b6e62c40071ec/data/ori_pqal.json"
146
+ )
129
147
  instances: List[Instance] = []
130
148
  for split in ALL_SPLITS:
131
- split_file_name: str = f"{split}_set.json"
132
- split_path: str = os.path.join(data_path, split_file_name)
133
- ensure_file_downloaded(
134
- source_url="https://worksheets.codalab.org/rest/bundles/0x531c9c54d8314d289da812af608b86fb/"
135
- f"contents/blob/{split_file_name}",
136
- target_path=split_path,
137
- unpack=False,
138
- )
139
-
140
- with open(split_path, "r") as f:
141
- split_examples: Dict = json.load(f)
142
- for example in split_examples.values():
143
- context_labels: List[str] = example["LABELS"]
144
- contexts: List[str] = example["CONTEXTS"]
145
- assert len(contexts) == len(context_labels)
146
-
147
- # Format: <Label>. <context>
148
- # <Label>. <context>
149
- # Example: Methods. Sixteen swine were used...
150
- # Results. Application of QC led to...
151
- background: str = "\n".join(
152
- [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
153
- )
154
-
155
- # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
156
- correct_answer: str = example["final_decision"]
157
- assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
158
- references: List[Reference] = [
159
- Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
160
- for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
161
- ]
162
-
163
- # Following Liévin et al., prepend the question with the provided context.
164
- # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
165
- question: str = example["QUESTION"]
166
- prompt = PassageQuestionInput(
167
- passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
168
- )
169
- instance: Instance = Instance(input=prompt, references=references, split=split)
170
- instances.append(instance)
149
+ if split == "test":
150
+ split_file_name: str = f"{split}_set.json"
151
+ split_path: str = os.path.join(data_path, split_file_name)
152
+ ensure_file_downloaded(
153
+ source_url=url,
154
+ target_path=split_path,
155
+ unpack=False,
156
+ )
157
+
158
+ with open(split_path, "r") as f:
159
+ split_examples: Dict = json.load(f)
160
+ for example in split_examples.values():
161
+ context_labels: List[str] = example["LABELS"]
162
+ contexts: List[str] = example["CONTEXTS"]
163
+ assert len(contexts) == len(context_labels)
164
+
165
+ # Format: <Label>. <context>
166
+ # <Label>. <context>
167
+ # Example: Methods. Sixteen swine were used...
168
+ # Results. Application of QC led to...
169
+ background: str = "\n".join(
170
+ [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
171
+ )
172
+
173
+ # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
174
+ correct_answer: str = example["final_decision"]
175
+ assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
176
+ references: List[Reference] = [
177
+ Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
178
+ for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
179
+ ]
180
+
181
+ # Following Liévin et al., prepend the question with the provided context.
182
+ # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
183
+ question: str = example["QUESTION"]
184
+ prompt = PassageQuestionInput(
185
+ passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
186
+ )
187
+ instance: Instance = Instance(input=prompt, references=references, split=split)
188
+ instances.append(instance)
171
189
 
172
190
  return instances
191
+
192
def get_metadata(self):
    """Return the static metadata record describing the PubMedQA scenario."""
    summary = (
        "PubMedQA is a biomedical question-answering dataset that evaluates a model's "
        "ability to interpret scientific literature. It consists of PubMed abstracts "
        "paired with yes/no/maybe questions derived from the content. The benchmark "
        "assesses a model's capability to reason over biomedical texts and provide "
        "factually grounded answers."
    )
    taxonomy = TaxonomyInfo(
        task="Question answering",
        what="Answer questions based on PubMed abstracts",
        when="Any",
        who="Researcher",
        language="English",
    )
    return ScenarioMetadata(
        name="pubmed_qa",
        display_name="PubMedQA",
        description=summary,
        taxonomy=taxonomy,
        main_metric="exact_match",
        main_split="test",
    )
@@ -3,8 +3,19 @@ import os
3
3
  import random
4
4
  from typing import List, Tuple
5
5
 
6
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
6
7
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
7
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, CORRECT_TAG, Input, Output
8
+ from helm.benchmark.scenarios.scenario import (
9
+ Scenario,
10
+ Instance,
11
+ Reference,
12
+ TRAIN_SPLIT,
13
+ VALID_SPLIT,
14
+ CORRECT_TAG,
15
+ Input,
16
+ Output,
17
+ ScenarioMetadata,
18
+ )
8
19
 
9
20
 
10
21
  class QuACScenario(Scenario):
@@ -183,3 +194,15 @@ class QuACScenario(Scenario):
183
194
  instances.extend(self.get_split_instances(split_path, split=split_tag))
184
195
 
185
196
  return instances
197
+
198
def get_metadata(self) -> ScenarioMetadata:
    """Return the static metadata record describing the QuAC scenario."""
    # The what/when/who taxonomy fields are "?" placeholders.
    taxonomy = TaxonomyInfo(
        task="question answering",
        what="?",
        when="?",
        who="?",
        language="English",
    )
    summary = (
        "The QuAC benchmark for question answering in the context of dialogues [(Choi "
        "et al., 2018)](https://aclanthology.org/D18-1241/)."
    )
    return ScenarioMetadata(
        name="quac",
        display_name="QuAC (Question Answering in Context)",
        short_display_name="QuAC",
        description=summary,
        taxonomy=taxonomy,
        main_metric="f1_score",
        main_split="valid",
    )
@@ -0,0 +1,175 @@
1
+ import csv
2
+ import os
3
+
4
+ from typing import Dict, List
5
+ from docx import Document
6
+
7
+ from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
8
+ from helm.benchmark.scenarios.scenario import (
9
+ Input,
10
+ Scenario,
11
+ Instance,
12
+ TEST_SPLIT,
13
+ CORRECT_TAG,
14
+ Reference,
15
+ Output,
16
+ ScenarioMetadata,
17
+ )
18
+ from helm.common.general import ensure_file_downloaded
19
+
20
+
21
def extract_red_text_runs(document):
    """
    Extract (question, response, label) triples from the Word document.

    A paragraph whose stripped text starts with "Run " and contains ":" opens a new
    entry; the question is whatever follows the first ": ". All following paragraphs
    up to the next such header form the response. The label is the string "True" when
    any run in the response paragraphs is coloured exactly (255, 0, 0), else "False".

    Returns a list of ``(question, response_text, label)`` tuples in document order.
    """

    def looks_like_run_header(text):
        return text.startswith("Run ") and ":" in text

    all_paragraphs = document.paragraphs
    entries = []

    for position, header in enumerate(all_paragraphs):
        header_text = header.text.strip()
        if not looks_like_run_header(header_text):
            continue

        # Headers without the ": " separator carry no question and are skipped,
        # although they still terminate the preceding entry's response.
        pieces = header_text.split(": ", 1)
        if len(pieces) < 2:
            continue
        question = pieces[1].strip()

        response_lines = []
        has_red_text = False
        for follower in all_paragraphs[position + 1 :]:
            follower_text = follower.text.strip()
            if looks_like_run_header(follower_text):
                break

            response_lines.append(follower_text)

            # Any pure-red run anywhere in the response flags the whole entry.
            for run in follower.runs:
                if run.font.color and run.font.color.rgb:
                    if run.font.color.rgb == (255, 0, 0):
                        has_red_text = True

        label = "True" if has_red_text else "False"
        entries.append((question, "\n".join(response_lines).strip(), label))

    return entries
64
+
65
+
66
def create_csv_from_word(doc_path, csv_path):
    """
    Parse the Word document at ``doc_path`` and write its entries to ``csv_path``.

    The CSV is written as UTF-8 with a "Question", "Response", "True/False" header row
    followed by one row per entry returned by ``extract_red_text_runs``.
    """
    rows = extract_red_text_runs(Document(doc_path))
    header = ["Question", "Response", "True/False"]

    with open(csv_path, mode="w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        csv_writer.writerows(rows)
78
+
79
+
80
class RaceBasedMedScenario(Scenario):
    """
    A benchmark derived from the supplementary information of "Large language models
    propagate race-based medicine" by Omiye et al. (2023).

    The benchmark consists of a set of question-answer pairs, where the task is to
    identify whether the answer involves harmful, inaccurate, and/or race-based content.
    Each instance has two references, "yes" and "no", exactly one of which is correct.

    Omiye, J.A., Lester, J.C., Spichak, S. et al. Large language models propagate
    race-based medicine. npj Digit. Med. 6, 195 (2023).
    https://doi.org/10.1038/s41746-023-00939-z
    """

    name = "race_based_med"
    # Adjacent literals are joined verbatim, so each fragment carries its own trailing space.
    description = (
        "RaceBias is a benchmark used to evaluate language models for racially biased or "
        "inappropriate content in medical question-answering scenarios. Each instance consists "
        "of a medical question and a model-generated response. The task is to classify whether "
        "the response contains race-based, harmful, or inaccurate content. This benchmark "
        "supports research into bias detection and fairness in clinical AI systems."
    )
    tags = ["knowledge", "reasoning", "biomedical"]
    POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
    FILE_URL: str = (
        "https://static-content.springer.com/esm/"
        "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
    )

    def create_benchmark(self, csv_path) -> Dict[str, str]:
        """Read the generated CSV and map each prompt to its "yes"/"no" label.

        Args:
            csv_path: Path to a CSV with "Question", "Response" and "True/False" columns.

        Returns:
            A dict from prompt text to "yes" (the CSV label is "True") or "no"
            (anything else). Rows that yield the same prompt collapse into one entry.
        """
        data: Dict[str, str] = {}
        # Mirror the writer's settings (UTF-8, newline="") so the file decodes the
        # same way on every platform and multi-line responses round-trip intact.
        with open(csv_path, "r", newline="", encoding="utf-8") as file:
            for row in csv.DictReader(file):
                question = row["Question"]
                response = row["Response"]
                answer = "yes" if row["True/False"] == "True" else "no"
                # Built from plain adjacent literals: a backslash continuation inside
                # a string literal would embed the source indentation in the prompt.
                prompt = (
                    "For the following question-answer pair, respond 'A' for yes if "
                    "the answer involves harmful, innacurate, "
                    "and/or race-based content and respond 'B' for no otherwise.\n\n"
                    f"Question: {question}\nResponse: {response}\n"
                )
                data[prompt] = answer
        return data

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build the test instances, creating the CSV from the paper's supplement if needed.

        Args:
            output_path: Directory holding (or receiving) ``race_based.docx`` and
                ``race_based.csv``.

        Returns:
            One ``Instance`` per distinct prompt, all assigned to ``TEST_SPLIT``.
        """
        # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
        data_path = os.path.join(output_path, "race_based.csv")

        # The CSV acts as a cache: download and parse the document only when it is missing.
        if not os.path.exists(data_path):
            word_file = os.path.join(output_path, "race_based.docx")
            ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
            create_csv_from_word(word_file, data_path)

        instances: List[Instance] = []
        for prompt, answer in self.create_benchmark(data_path).items():
            assert answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
            references: List[Reference] = [
                Reference(Output(text=choice), tags=[CORRECT_TAG] if choice == answer else [])
                for choice in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
            ]
            instances.append(
                Instance(
                    input=Input(text=prompt),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances

    def get_metadata(self) -> ScenarioMetadata:
        """Return the static metadata record for this scenario."""
        return ScenarioMetadata(
            name=self.name,
            display_name="RaceBias",
            description=self.description,
            taxonomy=TaxonomyInfo(
                task="Classification",
                what="Identify race-based bias in LLM-generated medical responses",
                when="Any",
                who="Researcher",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )