crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (1033)
  1. crfm_helm-0.5.10.dist-info/METADATA +369 -0
  2. crfm_helm-0.5.10.dist-info/RECORD +1008 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +80 -29
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  8. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  9. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  10. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
  11. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
  12. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
  13. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  14. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
  15. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  16. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
  17. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
  18. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
  19. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  20. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
  21. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  22. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  23. helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
  24. helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
  25. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
  26. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
  27. helm/benchmark/adaptation/common_adapter_specs.py +443 -0
  28. helm/benchmark/adaptation/prompt.py +1 -1
  29. helm/benchmark/adaptation/request_state.py +6 -1
  30. helm/benchmark/adaptation/scenario_state.py +6 -2
  31. helm/benchmark/annotation/aci_bench_annotator.py +84 -0
  32. helm/benchmark/annotation/air_bench_annotator.py +79 -0
  33. helm/benchmark/annotation/alrage_annotator.py +90 -0
  34. helm/benchmark/annotation/annotator.py +48 -0
  35. helm/benchmark/annotation/annotator_factory.py +50 -0
  36. helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
  37. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  38. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  39. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  40. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  41. helm/benchmark/annotation/call_center_annotator.py +258 -0
  42. helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
  43. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  44. helm/benchmark/annotation/dischargeme_annotator.py +96 -0
  45. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  46. helm/benchmark/annotation/financebench_annotator.py +79 -0
  47. helm/benchmark/annotation/harm_bench_annotator.py +55 -0
  48. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  49. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
  50. helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
  51. helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
  52. helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
  53. helm/benchmark/annotation/live_qa_annotator.py +76 -0
  54. helm/benchmark/annotation/med_dialog_annotator.py +88 -0
  55. helm/benchmark/annotation/medalign_annotator.py +89 -0
  56. helm/benchmark/annotation/medi_qa_annotator.py +87 -0
  57. helm/benchmark/annotation/medication_qa_annotator.py +86 -0
  58. helm/benchmark/annotation/mental_health_annotator.py +87 -0
  59. helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
  60. helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
  61. helm/benchmark/annotation/model_as_judge.py +309 -0
  62. helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
  63. helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
  64. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  65. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  66. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  67. helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
  68. helm/benchmark/annotation/spider_annotator.py +18 -0
  69. helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
  70. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  71. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  72. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  73. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  74. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  75. helm/benchmark/annotation/xstest_annotator.py +100 -0
  76. helm/benchmark/annotation_executor.py +144 -0
  77. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  78. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  79. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  80. helm/benchmark/augmentations/data_augmenter.py +0 -2
  81. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  82. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  83. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  84. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  85. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  86. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  87. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  88. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  89. helm/benchmark/augmentations/perturbation.py +26 -4
  90. helm/benchmark/augmentations/perturbation_description.py +1 -1
  91. helm/benchmark/augmentations/space_perturbation.py +2 -2
  92. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  93. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  94. helm/benchmark/augmentations/test_perturbation.py +56 -19
  95. helm/benchmark/augmentations/translate_perturbation.py +31 -0
  96. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  97. helm/benchmark/config_registry.py +7 -1
  98. helm/benchmark/data_preprocessor.py +2 -2
  99. helm/benchmark/executor.py +54 -25
  100. helm/benchmark/huggingface_registration.py +28 -10
  101. helm/benchmark/metrics/air_bench_metrics.py +3212 -0
  102. helm/benchmark/metrics/alrage_metric.py +35 -0
  103. helm/benchmark/metrics/annotation_metrics.py +108 -0
  104. helm/benchmark/metrics/basic_metrics.py +437 -667
  105. helm/benchmark/metrics/bbq_metrics.py +17 -6
  106. helm/benchmark/metrics/bias_metrics.py +18 -9
  107. helm/benchmark/metrics/bias_word_lists.py +1 -1
  108. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  109. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  110. helm/benchmark/metrics/classification_metrics.py +107 -22
  111. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  112. helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
  113. helm/benchmark/metrics/code_metrics.py +5 -5
  114. helm/benchmark/metrics/code_metrics_helper.py +11 -3
  115. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  116. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  117. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  118. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  119. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  120. helm/benchmark/metrics/comet_metric.py +125 -0
  121. helm/benchmark/metrics/common_metric_specs.py +174 -0
  122. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
  123. helm/benchmark/metrics/copyright_metrics.py +5 -5
  124. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  125. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  126. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  127. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  128. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  129. helm/benchmark/metrics/disinformation_metrics.py +8 -114
  130. helm/benchmark/metrics/dry_run_metrics.py +35 -6
  131. helm/benchmark/metrics/efficiency_metrics.py +287 -0
  132. helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
  133. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  134. helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
  135. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  136. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  137. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
  138. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  139. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  140. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  141. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
  142. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  143. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  144. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  145. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  146. helm/benchmark/metrics/ifeval_metrics.py +67 -0
  147. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  148. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  149. helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
  150. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  151. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  152. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  153. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  154. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  155. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  156. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  157. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  158. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  159. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  160. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  161. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  162. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  163. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  164. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  165. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  166. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  167. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  168. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  169. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  170. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  171. helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
  172. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  173. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  174. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  175. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  176. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  177. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  178. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  179. helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
  180. helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
  181. helm/benchmark/metrics/language_modeling_metrics.py +111 -0
  182. helm/benchmark/metrics/live_qa_metrics.py +35 -0
  183. helm/benchmark/metrics/llm_jury_metrics.py +58 -0
  184. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  185. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  186. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  187. helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
  188. helm/benchmark/metrics/medec_metrics.py +124 -0
  189. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  190. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  191. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  192. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  193. helm/benchmark/metrics/metric.py +121 -175
  194. helm/benchmark/metrics/metric_name.py +0 -1
  195. helm/benchmark/metrics/metric_service.py +23 -7
  196. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
  197. helm/benchmark/metrics/nltk_helper.py +32 -0
  198. helm/benchmark/metrics/omni_math_metrics.py +44 -0
  199. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  200. helm/benchmark/metrics/output_processing_metric.py +60 -0
  201. helm/benchmark/metrics/output_processors.py +15 -0
  202. helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
  203. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  204. helm/benchmark/metrics/ranking_metrics.py +5 -5
  205. helm/benchmark/metrics/reference_metric.py +148 -0
  206. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  207. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  208. helm/benchmark/metrics/safety_metrics.py +91 -0
  209. helm/benchmark/metrics/seahelm_metrics.py +201 -0
  210. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  211. helm/benchmark/metrics/spider_metrics.py +7 -0
  212. helm/benchmark/metrics/statistic.py +1 -1
  213. helm/benchmark/metrics/summac/model_summac.py +8 -11
  214. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  215. helm/benchmark/metrics/summarization_metrics.py +150 -11
  216. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  217. helm/benchmark/metrics/test_classification_metrics.py +145 -70
  218. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  219. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
  220. helm/benchmark/metrics/test_metric.py +3 -3
  221. helm/benchmark/metrics/test_statistic.py +2 -2
  222. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  223. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  224. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  225. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  226. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
  227. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  228. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
  229. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
  230. helm/benchmark/metrics/toxicity_metrics.py +37 -7
  231. helm/benchmark/metrics/toxicity_utils.py +23 -0
  232. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  233. helm/benchmark/metrics/unitxt_metrics.py +107 -0
  234. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  235. helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
  236. helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
  237. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  238. helm/benchmark/metrics/wildbench_metrics.py +54 -0
  239. helm/benchmark/model_deployment_registry.py +69 -5
  240. helm/benchmark/model_metadata_registry.py +58 -2
  241. helm/benchmark/multi_gpu_runner.py +133 -0
  242. helm/benchmark/presentation/contamination.py +3 -3
  243. helm/benchmark/presentation/create_plots.py +51 -20
  244. helm/benchmark/presentation/run_display.py +51 -12
  245. helm/benchmark/presentation/run_entry.py +2 -2
  246. helm/benchmark/presentation/schema.py +83 -66
  247. helm/benchmark/presentation/summarize.py +483 -388
  248. helm/benchmark/presentation/table.py +8 -8
  249. helm/benchmark/presentation/taxonomy_info.py +20 -0
  250. helm/benchmark/presentation/test_contamination.py +2 -2
  251. helm/benchmark/presentation/test_create_plots.py +4 -1
  252. helm/benchmark/presentation/test_run_entry.py +2 -2
  253. helm/benchmark/presentation/test_schema.py +11 -0
  254. helm/benchmark/presentation/test_summarize.py +148 -6
  255. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  256. helm/benchmark/reeval_run.py +202 -0
  257. helm/benchmark/reeval_runner.py +355 -0
  258. helm/benchmark/run.py +151 -87
  259. helm/benchmark/run_expander.py +418 -33
  260. helm/benchmark/run_spec.py +93 -0
  261. helm/benchmark/run_spec_factory.py +180 -0
  262. helm/benchmark/run_specs/__init__.py +0 -0
  263. helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
  264. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  265. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  266. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  267. helm/benchmark/run_specs/call_center_run_specs.py +201 -0
  268. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  269. helm/benchmark/run_specs/classic_run_specs.py +1393 -0
  270. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  271. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  272. helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
  273. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  274. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  275. helm/benchmark/run_specs/experimental_run_specs.py +224 -0
  276. helm/benchmark/run_specs/finance_run_specs.py +114 -0
  277. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  278. helm/benchmark/run_specs/heim_run_specs.py +625 -0
  279. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  280. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  281. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  282. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  283. helm/benchmark/run_specs/long_context_run_specs.py +188 -0
  284. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  285. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  286. helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
  287. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  288. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  289. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  290. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  291. helm/benchmark/run_specs/safety_run_specs.py +191 -0
  292. helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
  293. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  294. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
  295. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  296. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  297. helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
  298. helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
  299. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  300. helm/benchmark/runner.py +63 -62
  301. helm/benchmark/runner_config_registry.py +21 -0
  302. helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
  303. helm/benchmark/scenarios/air_bench_scenario.py +76 -0
  304. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  305. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  306. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
  307. helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
  308. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  309. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  310. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  311. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  312. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  313. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  314. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  315. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  316. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  317. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  318. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  319. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  320. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  321. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  322. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  323. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  324. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  325. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  326. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  327. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  328. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  329. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  330. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  331. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  332. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  333. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  334. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  335. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
  336. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  337. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
  338. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  339. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  340. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  341. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  342. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  343. helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
  344. helm/benchmark/scenarios/banking77_scenario.py +77 -0
  345. helm/benchmark/scenarios/bbq_scenario.py +17 -2
  346. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  347. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  348. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  349. helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
  350. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  351. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  352. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  353. helm/benchmark/scenarios/bold_scenario.py +18 -3
  354. helm/benchmark/scenarios/boolq_scenario.py +21 -1
  355. helm/benchmark/scenarios/call_center_scenario.py +84 -0
  356. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  357. helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
  358. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  359. helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
  360. helm/benchmark/scenarios/clear_scenario.py +180 -0
  361. helm/benchmark/scenarios/cleva_scenario.py +482 -3
  362. helm/benchmark/scenarios/code_scenario.py +46 -4
  363. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  364. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  365. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  366. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  367. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  368. helm/benchmark/scenarios/commonsense_scenario.py +33 -1
  369. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  370. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
  371. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  372. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  373. helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
  374. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  375. helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
  376. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
  377. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
  378. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
  379. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
  380. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
  381. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
  382. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
  383. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
  384. helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
  385. helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
  386. helm/benchmark/scenarios/disinformation_scenario.py +32 -1
  387. helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
  388. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  389. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  390. helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
  391. helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
  392. helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
  393. helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
  394. helm/benchmark/scenarios/ewok_scenario.py +116 -0
  395. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  396. helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
  397. helm/benchmark/scenarios/financebench_scenario.py +74 -0
  398. helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
  399. helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
  400. helm/benchmark/scenarios/gpqa_scenario.py +98 -0
  401. helm/benchmark/scenarios/grammar.py +2 -2
  402. helm/benchmark/scenarios/grammar_scenario.py +21 -2
  403. helm/benchmark/scenarios/gsm_scenario.py +31 -1
  404. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
  405. helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
  406. helm/benchmark/scenarios/headqa_scenario.py +158 -0
  407. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  408. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
  409. helm/benchmark/scenarios/ice_scenario.py +28 -4
  410. helm/benchmark/scenarios/ifeval_scenario.py +71 -0
  411. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  412. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  413. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  414. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  415. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  416. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  417. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  418. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  419. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  420. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  421. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  422. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  423. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  424. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  425. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  426. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  427. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  428. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  429. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  430. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  431. helm/benchmark/scenarios/imdb_scenario.py +26 -3
  432. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  433. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  434. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
  435. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  436. helm/benchmark/scenarios/koala_scenario.py +21 -1
  437. helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
  438. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
  439. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  440. helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
  441. helm/benchmark/scenarios/legal_support_scenario.py +24 -1
  442. helm/benchmark/scenarios/legalbench_scenario.py +45 -3
  443. helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
  444. helm/benchmark/scenarios/lextreme_scenario.py +22 -1
  445. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  446. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  447. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  448. helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
  449. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  450. helm/benchmark/scenarios/math_scenario.py +81 -22
  451. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  452. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  453. helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
  454. helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
  455. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  456. helm/benchmark/scenarios/med_qa_scenario.py +30 -1
  457. helm/benchmark/scenarios/medalign_scenario.py +117 -0
  458. helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
  459. helm/benchmark/scenarios/medbullets_scenario.py +167 -0
  460. helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
  461. helm/benchmark/scenarios/medec_scenario.py +148 -0
  462. helm/benchmark/scenarios/medhallu_scenario.py +95 -0
  463. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  464. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  465. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  466. helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
  467. helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
  468. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  469. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  470. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  471. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  472. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  473. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  474. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  475. helm/benchmark/scenarios/mental_health_scenario.py +146 -0
  476. helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
  477. helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
  478. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
  479. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  480. helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
  481. helm/benchmark/scenarios/mmlu_scenario.py +32 -1
  482. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  483. helm/benchmark/scenarios/msmarco_scenario.py +31 -1
  484. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
  485. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
  486. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
  487. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
  488. helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
  489. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  490. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  491. helm/benchmark/scenarios/omni_math_scenario.py +71 -0
  492. helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
  493. helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
  494. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
  495. helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
  496. helm/benchmark/scenarios/quac_scenario.py +24 -1
  497. helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
  498. helm/benchmark/scenarios/raft_scenario.py +33 -3
  499. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  500. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  501. helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
  502. helm/benchmark/scenarios/scenario.py +44 -1
  503. helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
  504. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  505. helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
  506. helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
  507. helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
  508. helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
  509. helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
  510. helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
  511. helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
  512. helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
  513. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  514. helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
  515. helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
  516. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  517. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  518. helm/benchmark/scenarios/spider_scenario.py +109 -0
  519. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
  520. helm/benchmark/scenarios/summarization_scenario.py +48 -1
  521. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  522. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  523. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
  524. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  525. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  526. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  527. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  528. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  529. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  530. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  531. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  532. helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
  533. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  534. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  535. helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
  536. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  537. helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
  538. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  539. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  540. helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
  541. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  542. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  543. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  544. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  545. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  546. helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
  547. helm/benchmark/scenarios/test_math_scenario.py +4 -3
  548. helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
  549. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  550. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  551. helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
  552. helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
  553. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  554. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  555. helm/benchmark/scenarios/test_scenario.py +6 -3
  556. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  557. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  558. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  559. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  560. helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
  561. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  562. helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
  563. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  564. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  565. helm/benchmark/scenarios/unitxt_scenario.py +62 -0
  566. helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
  567. helm/benchmark/scenarios/vicuna_scenario.py +22 -2
  568. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  569. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  570. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  571. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
  572. helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
  573. helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
  574. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  575. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  576. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  577. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  578. helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
  579. helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
  580. helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
  581. helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
  582. helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
  583. helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
  584. helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
  585. helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
  586. helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
  587. helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
  588. helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
  589. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  590. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  591. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  592. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  593. helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
  594. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  595. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  596. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  597. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  598. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  599. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  600. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  601. helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
  602. helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
  603. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
  604. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  605. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
  606. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
  607. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  608. helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
  609. helm/benchmark/scenarios/wikifact_scenario.py +31 -1
  610. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  611. helm/benchmark/scenarios/wildbench_scenario.py +101 -0
  612. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  613. helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
  614. helm/benchmark/scenarios/xstest_scenario.py +35 -0
  615. helm/benchmark/server.py +32 -2
  616. helm/benchmark/slurm_jobs.py +1 -2
  617. helm/benchmark/slurm_runner.py +78 -50
  618. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  619. helm/benchmark/static/schema_arabic.yaml +271 -0
  620. helm/benchmark/static/schema_audio.yaml +763 -0
  621. helm/benchmark/static/schema_autobencher.yaml +150 -0
  622. helm/benchmark/static/schema_call_center.yaml +269 -0
  623. helm/benchmark/static/schema_capabilities.yaml +254 -0
  624. helm/benchmark/static/schema_classic.yaml +259 -1140
  625. helm/benchmark/static/schema_cleva.yaml +768 -0
  626. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  627. helm/benchmark/static/schema_decodingtrust.yaml +444 -0
  628. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  629. helm/benchmark/static/schema_enterprise.yaml +319 -0
  630. helm/benchmark/static/schema_ewok.yaml +367 -0
  631. helm/benchmark/static/schema_finance.yaml +191 -0
  632. helm/benchmark/static/schema_heim.yaml +1389 -0
  633. helm/benchmark/static/schema_image2struct.yaml +588 -0
  634. helm/benchmark/static/schema_instruction_following.yaml +161 -0
  635. helm/benchmark/static/schema_legal.yaml +566 -0
  636. helm/benchmark/static/schema_lite.yaml +3 -286
  637. helm/benchmark/static/schema_long_context.yaml +282 -0
  638. helm/benchmark/static/schema_medhelm.yaml +1176 -0
  639. helm/benchmark/static/schema_melt.yaml +1257 -0
  640. helm/benchmark/static/schema_mmlu.yaml +1449 -0
  641. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  642. helm/benchmark/static/schema_safety.yaml +283 -0
  643. helm/benchmark/static/schema_seahelm.yaml +723 -0
  644. helm/benchmark/static/schema_slp.yaml +219 -0
  645. helm/benchmark/static/schema_slphelm.yaml +162 -0
  646. helm/benchmark/static/schema_social_audio.yaml +224 -0
  647. helm/benchmark/static/schema_sql.yaml +171 -0
  648. helm/benchmark/static/schema_thai.yaml +244 -0
  649. helm/benchmark/static/schema_torr.yaml +474 -0
  650. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  651. helm/benchmark/static/schema_unitxt.yaml +370 -0
  652. helm/benchmark/static/schema_vhelm.yaml +933 -0
  653. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  654. helm/benchmark/static/schema_video.yaml +219 -0
  655. helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
  656. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  657. helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
  658. helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
  659. helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
  660. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  661. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  662. helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
  663. helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
  664. helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
  665. helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
  666. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  667. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  668. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  669. helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
  670. helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
  671. helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
  672. helm/benchmark/static_build/config.js +4 -0
  673. helm/benchmark/static_build/index.html +19 -0
  674. helm/benchmark/test_data_preprocessor.py +3 -3
  675. helm/benchmark/test_run_expander.py +1 -1
  676. helm/benchmark/window_services/default_window_service.py +3 -45
  677. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
  678. helm/benchmark/window_services/ice_window_service.py +1 -35
  679. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  680. helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
  681. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  682. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  683. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  684. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  685. helm/benchmark/window_services/local_window_service.py +22 -5
  686. helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
  687. helm/benchmark/window_services/test_bloom_window_service.py +5 -4
  688. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  689. helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
  690. helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
  691. helm/benchmark/window_services/test_gptj_window_service.py +11 -5
  692. helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
  693. helm/benchmark/window_services/test_openai_window_service.py +18 -12
  694. helm/benchmark/window_services/test_opt_window_service.py +6 -5
  695. helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
  696. helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
  697. helm/benchmark/window_services/test_t511b_window_service.py +5 -4
  698. helm/benchmark/window_services/test_ul2_window_service.py +5 -4
  699. helm/benchmark/window_services/test_utils.py +6 -6
  700. helm/benchmark/window_services/test_yalm_window_service.py +5 -4
  701. helm/benchmark/window_services/tokenizer_service.py +7 -13
  702. helm/benchmark/window_services/window_service.py +42 -0
  703. helm/benchmark/window_services/window_service_factory.py +4 -1
  704. helm/benchmark/window_services/yalm_window_service.py +1 -28
  705. helm/clients/__init__.py +0 -0
  706. helm/{proxy/clients → clients}/ai21_client.py +78 -12
  707. helm/clients/aleph_alpha_client.py +114 -0
  708. helm/{proxy/clients → clients}/anthropic_client.py +304 -21
  709. helm/clients/audio_language/__init__.py +0 -0
  710. helm/clients/audio_language/diva_llama_client.py +122 -0
  711. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  712. helm/clients/audio_language/llama_omni/constants.py +9 -0
  713. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  714. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  715. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  716. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  717. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  718. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  719. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  720. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  721. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  722. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  723. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  724. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  725. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  726. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  727. helm/clients/audio_language/llama_omni/utils.py +202 -0
  728. helm/clients/audio_language/llama_omni_client.py +199 -0
  729. helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
  730. helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
  731. helm/clients/audio_language/qwen_audiolm_client.py +153 -0
  732. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  733. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  734. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  735. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  736. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  737. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  738. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  739. helm/clients/audio_language/test.py +62 -0
  740. helm/{proxy/clients → clients}/auto_client.py +72 -31
  741. helm/clients/azure_openai_client.py +55 -0
  742. helm/clients/bedrock_client.py +381 -0
  743. helm/clients/bedrock_utils.py +105 -0
  744. helm/{proxy/clients → clients}/client.py +92 -17
  745. helm/clients/clip_score_client.py +49 -0
  746. helm/clients/clip_scorers/__init__.py +0 -0
  747. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  748. helm/clients/clip_scorers/clip_scorer.py +50 -0
  749. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  750. helm/{proxy/clients → clients}/cohere_client.py +105 -14
  751. helm/clients/dspy_client.py +135 -0
  752. helm/clients/gcs_client.py +82 -0
  753. helm/{proxy/clients → clients}/google_client.py +8 -6
  754. helm/clients/google_translate_client.py +35 -0
  755. helm/clients/grok_client.py +36 -0
  756. helm/{proxy/clients → clients}/http_model_client.py +8 -8
  757. helm/{proxy/clients → clients}/huggingface_client.py +157 -86
  758. helm/clients/huggingface_pipeline_client.py +138 -0
  759. helm/clients/ibm_client.py +269 -0
  760. helm/clients/image_generation/__init__.py +0 -0
  761. helm/clients/image_generation/adobe_vision_client.py +80 -0
  762. helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
  763. helm/clients/image_generation/cogview2/__init__.py +0 -0
  764. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  765. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  766. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  767. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
  768. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  769. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  770. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
  771. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  772. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  773. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  774. helm/clients/image_generation/cogview2_client.py +192 -0
  775. helm/clients/image_generation/dalle2_client.py +194 -0
  776. helm/clients/image_generation/dalle3_client.py +108 -0
  777. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  778. helm/clients/image_generation/dalle_mini/data.py +442 -0
  779. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  780. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  781. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  782. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  783. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  784. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  785. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  786. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  787. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  788. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  789. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  790. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  791. helm/clients/image_generation/dalle_mini_client.py +191 -0
  792. helm/clients/image_generation/deep_floyd_client.py +80 -0
  793. helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
  794. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  795. helm/clients/image_generation/lexica_client.py +88 -0
  796. helm/clients/image_generation/mindalle/__init__.py +0 -0
  797. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  798. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  799. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  800. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  801. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  802. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  803. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  804. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  805. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  806. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  807. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  808. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  809. helm/clients/image_generation/mindalle_client.py +116 -0
  810. helm/clients/image_generation/nudity_check_client.py +64 -0
  811. helm/clients/image_generation/together_image_generation_client.py +113 -0
  812. helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
  813. helm/{proxy/clients → clients}/megatron_client.py +7 -5
  814. helm/clients/mistral_client.py +180 -0
  815. helm/clients/moderation_api_client.py +111 -0
  816. helm/clients/nvidia_nim_client.py +32 -0
  817. helm/clients/open_lm_client.py +43 -0
  818. helm/clients/openai_client.py +604 -0
  819. helm/clients/openai_responses_client.py +200 -0
  820. helm/clients/openrouter_client.py +31 -0
  821. helm/{proxy/clients → clients}/palmyra_client.py +31 -14
  822. helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
  823. helm/clients/reka_client.py +190 -0
  824. helm/clients/simple_client.py +64 -0
  825. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  826. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  827. helm/clients/stanfordhealthcare_google_client.py +43 -0
  828. helm/clients/stanfordhealthcare_http_model_client.py +95 -0
  829. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  830. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  831. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  832. helm/clients/test_client.py +98 -0
  833. helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
  834. helm/clients/test_openrouter_client.py +69 -0
  835. helm/clients/test_simple_client.py +19 -0
  836. helm/clients/test_together_client.py +184 -0
  837. helm/clients/together_client.py +599 -0
  838. helm/clients/upstage_client.py +23 -0
  839. helm/clients/vertexai_client.py +488 -0
  840. helm/clients/vision_language/__init__.py +0 -0
  841. helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
  842. helm/clients/vision_language/huggingface_vlm_client.py +114 -0
  843. helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
  844. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  845. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  846. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  847. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  848. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  849. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  850. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  851. helm/clients/vision_language/open_flamingo_client.py +155 -0
  852. helm/clients/vision_language/paligemma_client.py +147 -0
  853. helm/clients/vision_language/palmyra_vision_client.py +101 -0
  854. helm/clients/vision_language/qwen2_vlm_client.py +189 -0
  855. helm/clients/vision_language/qwen_vlm_client.py +174 -0
  856. helm/clients/vllm_client.py +80 -0
  857. helm/clients/vllm_granite_thinking_client.py +56 -0
  858. helm/clients/writer_client.py +105 -0
  859. helm/clients/yi_client.py +28 -0
  860. helm/common/audio_utils.py +111 -0
  861. helm/common/cache.py +23 -33
  862. helm/common/cache_backend_config.py +47 -0
  863. helm/common/clip_score_request.py +41 -0
  864. helm/common/context.py +80 -0
  865. helm/common/credentials_utils.py +5 -5
  866. helm/common/critique_request.py +10 -2
  867. helm/common/file_caches/__init__.py +0 -0
  868. helm/common/file_caches/file_cache.py +16 -0
  869. helm/common/file_caches/local_file_cache.py +61 -0
  870. helm/common/file_caches/test_local_file_cache.py +25 -0
  871. helm/common/file_upload_request.py +27 -0
  872. helm/common/general.py +10 -3
  873. helm/common/hierarchical_logger.py +124 -12
  874. helm/common/image_generation_parameters.py +25 -0
  875. helm/common/images_utils.py +60 -5
  876. helm/common/key_value_store.py +41 -10
  877. helm/common/local_context.py +140 -0
  878. helm/common/media_object.py +14 -1
  879. helm/common/moderations_api_request.py +71 -0
  880. helm/common/mongo_key_value_store.py +8 -7
  881. helm/common/multimodal_request_utils.py +57 -0
  882. helm/common/nudity_check_request.py +29 -0
  883. helm/common/object_spec.py +23 -8
  884. helm/common/optional_dependencies.py +1 -1
  885. helm/common/reeval_parameters.py +12 -0
  886. helm/common/remote_context.py +61 -0
  887. helm/common/request.py +45 -19
  888. helm/common/response_format.py +18 -0
  889. helm/common/test_cache.py +1 -48
  890. helm/common/test_general.py +10 -0
  891. helm/common/test_logging.py +94 -0
  892. helm/common/test_media_object.py +1 -1
  893. helm/common/tokenization_request.py +1 -10
  894. helm/config/model_deployments.yaml +4713 -1005
  895. helm/config/model_metadata.yaml +4045 -255
  896. helm/config/tokenizer_configs.yaml +1091 -50
  897. helm/proxy/accounts.py +31 -4
  898. helm/proxy/cli.py +6 -4
  899. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  900. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  901. helm/proxy/critique/model_critique_client.py +40 -10
  902. helm/proxy/example_queries.py +33 -28
  903. helm/proxy/retry.py +5 -0
  904. helm/proxy/server.py +82 -18
  905. helm/proxy/services/remote_service.py +32 -7
  906. helm/proxy/services/server_service.py +71 -69
  907. helm/proxy/services/service.py +30 -6
  908. helm/proxy/services/test_remote_service.py +6 -5
  909. helm/proxy/services/test_service.py +1 -13
  910. helm/proxy/static/help.html +99 -0
  911. helm/proxy/static/index.css +61 -0
  912. helm/proxy/static/index.html +40 -0
  913. helm/proxy/static/index.js +462 -0
  914. helm/proxy/test_accounts.py +32 -0
  915. helm/proxy/test_retry.py +1 -1
  916. helm/proxy/token_counters/auto_token_counter.py +37 -37
  917. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  918. helm/proxy/token_counters/token_counter.py +3 -5
  919. helm/tokenizers/__init__.py +0 -0
  920. helm/tokenizers/ai21_tokenizer.py +52 -0
  921. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
  922. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
  923. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
  924. helm/tokenizers/cohere_tokenizer.py +50 -0
  925. helm/tokenizers/grok_tokenizer.py +55 -0
  926. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
  927. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
  928. helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
  929. helm/tokenizers/simple_tokenizer.py +33 -0
  930. helm/tokenizers/test_ai21_tokenizer.py +48 -0
  931. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
  932. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  933. helm/tokenizers/test_grok_tokenizer.py +33 -0
  934. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
  935. helm/tokenizers/test_simple_tokenizer.py +33 -0
  936. helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
  937. helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
  938. helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
  939. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  940. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
  941. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  942. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  943. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  944. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  945. crfm_helm-0.4.0.dist-info/METADATA +0 -264
  946. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  947. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  948. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  949. helm/benchmark/data_overlap/light_scenario.py +0 -60
  950. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  951. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  952. helm/benchmark/run_specs.py +0 -2762
  953. helm/benchmark/scenarios/numeracy_scenario.py +0 -784
  954. helm/benchmark/static/benchmarking.css +0 -156
  955. helm/benchmark/static/benchmarking.js +0 -1705
  956. helm/benchmark/static/config.js +0 -3
  957. helm/benchmark/static/images/helm-logo.png +0 -0
  958. helm/benchmark/static/images/language-model-helm.png +0 -0
  959. helm/benchmark/static/images/organizations/ai21.png +0 -0
  960. helm/benchmark/static/images/organizations/anthropic.png +0 -0
  961. helm/benchmark/static/images/organizations/bigscience.png +0 -0
  962. helm/benchmark/static/images/organizations/cohere.png +0 -0
  963. helm/benchmark/static/images/organizations/eleutherai.png +0 -0
  964. helm/benchmark/static/images/organizations/google.png +0 -0
  965. helm/benchmark/static/images/organizations/meta.png +0 -0
  966. helm/benchmark/static/images/organizations/microsoft.png +0 -0
  967. helm/benchmark/static/images/organizations/nvidia.png +0 -0
  968. helm/benchmark/static/images/organizations/openai.png +0 -0
  969. helm/benchmark/static/images/organizations/together.png +0 -0
  970. helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
  971. helm/benchmark/static/images/organizations/yandex.png +0 -0
  972. helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
  973. helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
  974. helm/benchmark/static/index.html +0 -68
  975. helm/benchmark/static/json-urls.js +0 -69
  976. helm/benchmark/static/plot-captions.js +0 -27
  977. helm/benchmark/static/utils.js +0 -285
  978. helm/benchmark/test_model_deployment_definition.py +0 -92
  979. helm/benchmark/test_model_properties.py +0 -1570
  980. helm/benchmark/vlm_run_specs.py +0 -97
  981. helm/benchmark/window_services/ai21_window_service.py +0 -258
  982. helm/benchmark/window_services/cohere_window_service.py +0 -163
  983. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  984. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  985. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  986. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  987. helm/benchmark/window_services/t511b_window_service.py +0 -30
  988. helm/benchmark/window_services/test_ai21_window_service.py +0 -163
  989. helm/benchmark/window_services/test_cohere_window_service.py +0 -74
  990. helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
  991. helm/benchmark/window_services/test_ice_window_service.py +0 -326
  992. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  993. helm/benchmark/window_services/ul2_window_service.py +0 -30
  994. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  995. helm/common/cache_utils.py +0 -14
  996. helm/proxy/clients/aleph_alpha_client.py +0 -95
  997. helm/proxy/clients/goose_ai_client.py +0 -99
  998. helm/proxy/clients/microsoft_client.py +0 -180
  999. helm/proxy/clients/openai_client.py +0 -206
  1000. helm/proxy/clients/simple_client.py +0 -60
  1001. helm/proxy/clients/test_client.py +0 -49
  1002. helm/proxy/clients/test_together_client.py +0 -97
  1003. helm/proxy/clients/together_client.py +0 -334
  1004. helm/proxy/clients/vertexai_client.py +0 -115
  1005. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  1006. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  1007. helm/proxy/token_counters/free_token_counter.py +0 -12
  1008. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  1009. helm/proxy/token_counters/openai_token_counter.py +0 -22
  1010. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  1011. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  1012. helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
  1013. helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
  1014. helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
  1015. helm/proxy/tokenizers/ice_tokenizer.py +0 -30
  1016. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  1017. helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
  1018. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  1019. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
  1020. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  1021. /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
  1022. /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
  1023. /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
  1024. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  1025. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  1026. /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
  1027. /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
  1028. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  1029. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  1030. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  1031. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  1032. /helm/{benchmark → proxy}/static/general.js +0 -0
  1033. /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -0,0 +1,1449 @@
1
+ ---
2
+ ############################################################
3
+ # For backwards compatibility with older versions of HELM.
4
+ # TODO: Remove this after 2024-09-01.
5
+ adapter: []
6
+ ############################################################
7
+ metrics:
8
+ # Infrastructure metrics:
9
+ - name: num_perplexity_tokens
10
+ display_name: '# tokens'
11
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
12
+ - name: num_bytes
13
+ display_name: '# bytes'
14
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
15
+
16
+ - name: num_references
17
+ display_name: '# ref'
18
+ description: Number of references.
19
+ - name: num_train_trials
20
+ display_name: '# trials'
21
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
22
+ - name: estimated_num_tokens_cost
23
+ display_name: 'cost'
24
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
25
+ - name: num_prompt_tokens
26
+ display_name: '# prompt tokens'
27
+ description: Number of tokens in the prompt.
28
+ - name: num_prompt_characters
29
+ display_name: '# prompt chars'
30
+ description: Number of characters in the prompt.
31
+ - name: num_completion_tokens
32
+ display_name: '# completion tokens'
33
+ description: Actual number of completion tokens (over all completions).
34
+ - name: num_output_tokens
35
+ display_name: '# output tokens'
36
+ description: Actual number of output tokens.
37
+ - name: max_num_output_tokens
38
+ display_name: 'Max output tokens'
39
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
40
+ - name: num_requests
41
+ display_name: '# requests'
42
+ description: Number of distinct API requests.
43
+ - name: num_instances
44
+ display_name: '# eval'
45
+ description: Number of evaluation instances.
46
+ - name: num_train_instances
47
+ display_name: '# train'
48
+ description: Number of training instances (e.g., in-context examples).
49
+ - name: prompt_truncated
50
+ display_name: truncated
51
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
52
+ - name: finish_reason_length
53
+ display_name: finish b/c length
54
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
55
+ - name: finish_reason_stop
56
+ display_name: finish b/c stop
57
+ description: Fraction of instances where the output was terminated because of the stop sequences.
58
+ - name: finish_reason_endoftext
59
+ display_name: finish b/c endoftext
60
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
61
+ - name: finish_reason_unknown
62
+ display_name: finish b/c unknown
63
+ description: Fraction of instances where the output was terminated for unknown reasons.
64
+ - name: num_completions
65
+ display_name: '# completions'
66
+ description: Number of completions.
67
+ - name: predicted_index
68
+ display_name: Predicted index
69
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
70
+
71
+ # Accuracy metrics:
72
+ - name: exact_match
73
+ display_name: Exact match
74
+ short_display_name: EM
75
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
76
+ lower_is_better: false
77
+ - name: quasi_exact_match
78
+ display_name: Quasi-exact match
79
+ short_display_name: EM
80
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
81
+ lower_is_better: false
82
+ - name: prefix_exact_match
83
+ display_name: Prefix exact match
84
+ short_display_name: PEM
85
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
86
+ lower_is_better: false
87
+ - name: quasi_prefix_exact_match
88
+ # TODO: should call this prefix_quasi_exact_match
89
+ display_name: Prefix quasi-exact match
90
+ short_display_name: PEM
91
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
92
+ lower_is_better: false
93
+
94
+ - name: exact_match@5
95
+ display_name: Exact match @5
96
+ short_display_name: EM@5
97
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
98
+ lower_is_better: false
99
+ - name: quasi_exact_match@5
100
+ display_name: Quasi-exact match @5
101
+ short_display_name: EM@5
102
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
103
+ lower_is_better: false
104
+ - name: prefix_exact_match@5
105
+ display_name: Prefix exact match @5
106
+ short_display_name: PEM@5
107
+ description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference exactly.
108
+ lower_is_better: false
109
+ - name: quasi_prefix_exact_match@5
110
+ display_name: Prefix quasi-exact match @5
111
+ short_display_name: PEM@5
112
+ description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference up to light processing.
113
+ lower_is_better: false
114
+
115
+ - name: logprob
116
+ display_name: Log probability
117
+ short_display_name: Logprob
118
+ description: Predicted output's average log probability (input's log prob for language modeling).
119
+ lower_is_better: false
120
+ - name: logprob_per_byte
121
+ display_name: Log probability / byte
122
+ short_display_name: Logprob/byte
123
+ description: Predicted output's average log probability normalized by the number of bytes.
124
+ lower_is_better: false
125
+ - name: bits_per_byte
126
+ display_name: Bits/byte
127
+ short_display_name: BPB
128
+ lower_is_better: true
129
+ description: Average number of bits per byte according to model probabilities.
130
+ - name: perplexity
131
+ display_name: Perplexity
132
+ short_display_name: PPL
133
+ lower_is_better: true
134
+ description: Perplexity of the output completion (effective branching factor per output token).
135
+ - name: rouge_1
136
+ display_name: ROUGE-1
137
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
138
+ lower_is_better: false
139
+ - name: rouge_2
140
+ display_name: ROUGE-2
141
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
142
+ lower_is_better: false
143
+ - name: rouge_l
144
+ display_name: ROUGE-L
145
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
146
+ lower_is_better: false
147
+ - name: bleu_1
148
+ display_name: BLEU-1
149
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
150
+ lower_is_better: false
151
+ - name: bleu_4
152
+ display_name: BLEU-4
153
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
154
+ lower_is_better: false
155
+ - name: f1_set_match
156
+ display_name: F1 (set match)
157
+ short_display_name: F1
158
+ description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
159
+ lower_is_better: false
160
+ - name: f1_score
161
+ display_name: F1
162
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
163
+ lower_is_better: false
164
+ - name: classification_macro_f1
165
+ display_name: Macro-F1
166
+ description: Population-level macro-averaged F1 score.
167
+ lower_is_better: false
168
+ - name: classification_micro_f1
169
+ display_name: Micro-F1
170
+ description: Population-level micro-averaged F1 score.
171
+ lower_is_better: false
172
+ - name: absolute_value_difference
173
+ display_name: Absolute difference
174
+ short_display_name: Diff.
175
+ lower_is_better: true
176
+ description: Average absolute difference between the model output (converted to a number) and the correct reference.
177
+ - name: distance
178
+ display_name: Geometric distance
179
+ short_display_name: Dist.
180
+ lower_is_better: true
181
+ description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
182
+ - name: percent_valid
183
+ display_name: Valid fraction
184
+ short_display_name: Valid
185
+ description: Fraction of valid model outputs (as a number).
186
+ lower_is_better: false
187
+ - name: NDCG@10
188
+ display_name: NDCG@10
189
+ description: Normalized discounted cumulative gain at 10 in information retrieval.
190
+ lower_is_better: false
191
+ - name: RR@10
192
+ display_name: RR@10
193
+ description: Mean reciprocal rank at 10 in information retrieval.
194
+ lower_is_better: false
195
+ - name: NDCG@20
196
+ display_name: NDCG@20
197
+ description: Normalized discounted cumulative gain at 20 in information retrieval.
198
+ lower_is_better: false
199
+ - name: RR@20
200
+ display_name: RR@20
201
+ description: Mean reciprocal rank at 20 in information retrieval.
202
+ lower_is_better: false
203
+ - name: math_equiv
204
+ display_name: Equivalent
205
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference.
206
+ lower_is_better: false
207
+ - name: math_equiv_chain_of_thought
208
+ display_name: Equivalent (CoT)
209
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
210
+ lower_is_better: false
211
+ - name: exact_match_indicator
212
+ display_name: Exact match (final)
213
+ short_display_name: EM
214
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
215
+ lower_is_better: false
216
+ - name: final_number_exact_match
217
+ display_name: Exact match (final number)
218
+ short_display_name: EM
219
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
220
+ lower_is_better: false
221
+ - name: exact_set_match
222
+ display_name: Exact match (as sets)
223
+ short_display_name: EM
224
+ description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
225
+ lower_is_better: false
226
+ - name: iou_set_match
227
+ display_name: Intersection over union (as sets)
228
+ short_display_name: IoU
229
+ description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
230
+ lower_is_better: false
231
+
232
+ # Efficiency metrics:
233
+ - name: training_co2_cost
234
+ display_name: Estimated training emissions (kg CO2)
235
+ short_display_name: Training emissions (kg CO2)
236
+ lower_is_better: true
237
+ description: Estimate of the CO2 emissions from training the model.
238
+ - name: training_energy_cost
239
+ display_name: Estimated training energy cost (MWh)
240
+ short_display_name: Training energy (MWh)
241
+ lower_is_better: true
242
+ description: Estimate of the amount of energy used to train the model.
243
+ - name: inference_runtime
244
+ display_name: Observed inference runtime (s)
245
+ short_display_name: Observed inference time (s)
246
+ lower_is_better: true
247
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
248
+ - name: inference_idealized_runtime
249
+ display_name: Idealized inference runtime (s)
250
+ short_display_name: Idealized inference time (s)
251
+ lower_is_better: true
252
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
253
+ - name: inference_denoised_runtime
254
+ display_name: Denoised inference runtime (s)
255
+ short_display_name: Denoised inference time (s)
256
+ lower_is_better: true
257
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
258
+ - name: batch_size
259
+ display_name: Batch size
260
+ description: For batch jobs, how many requests are in a batch.
261
+
262
+ # Calibration metrics:
263
+ - name: ece_1_bin
264
+ display_name: 1-bin expected calibration error
265
+ short_display_name: ECE (1-bin)
266
+ lower_is_better: true
267
+ description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
268
+ - name: max_prob
269
+ display_name: Max prob
270
+ description: Model's average confidence in its prediction (only computed for classification tasks).
271
+ lower_is_better: false
272
+ - name: ece_10_bin
273
+ display_name: 10-bin expected calibration error
274
+ short_display_name: ECE (10-bin)
275
+ lower_is_better: true
276
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
277
+ - name: platt_ece_1_bin
278
+ display_name: 1-bin expected calibration error (after Platt scaling)
279
+ short_display_name: Platt-scaled ECE (1-bin)
280
+ lower_is_better: true
281
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
282
+ - name: platt_ece_10_bin
283
+ display_name: 10-bin Expected Calibration Error (after Platt scaling)
284
+ short_display_name: Platt-scaled ECE (10-bin)
285
+ lower_is_better: true
286
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
287
+ - name: platt_coef
288
+ display_name: Platt Scaling Coefficient
289
+ short_display_name: Platt Coef
290
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
291
+ lower_is_better: false
292
+ - name: platt_intercept
293
+ display_name: Platt Scaling Intercept
294
+ short_display_name: Platt Intercept
295
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
296
+ lower_is_better: false
297
+ - name: selective_cov_acc_area
298
+ display_name: Selective coverage-accuracy area
299
+ short_display_name: Selective Acc
300
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
301
+ lower_is_better: false
302
+ - name: selective_acc@10
303
+ display_name: Accuracy at 10% coverage
304
+ short_display_name: Acc@10%
305
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
306
+ lower_is_better: false
307
+
308
+ ############################################################
309
+ perturbations: []
310
+ ############################################################
311
+ metric_groups:
312
+ - name: accuracy
313
+ display_name: Accuracy
314
+ hide_win_rates: true
315
+ metrics:
316
+ - name: ${main_name}
317
+ split: ${main_split}
318
+
319
+ - name: efficiency
320
+ display_name: Efficiency
321
+ metrics:
322
+ - name: inference_runtime
323
+ split: ${main_split}
324
+
325
+ - name: general_information
326
+ display_name: General information
327
+ hide_win_rates: true
328
+ metrics:
329
+ - name: num_instances
330
+ split: ${main_split}
331
+ - name: num_train_instances
332
+ split: ${main_split}
333
+ - name: prompt_truncated
334
+ split: ${main_split}
335
+ - name: num_prompt_tokens
336
+ split: ${main_split}
337
+ - name: num_output_tokens
338
+ split: ${main_split}
339
+
340
+ ############################################################
341
+ run_groups:
342
+ - name: mmlu_subjects
343
+ display_name: MMLU Subjects
344
+ short_display_name: MMLU Subjects
345
+ description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
346
+ category: All Scenarios
347
+ subgroups:
348
+ - mmlu
349
+ - mmlu_abstract_algebra
350
+ - mmlu_anatomy
351
+ - mmlu_college_chemistry
352
+ - mmlu_computer_security
353
+ - mmlu_econometrics
354
+ - mmlu_global_facts
355
+ - mmlu_jurisprudence
356
+ - mmlu_philosophy
357
+ - mmlu_professional_medicine
358
+ - mmlu_us_foreign_policy
359
+ - mmlu_astronomy
360
+ - mmlu_business_ethics
361
+ - mmlu_clinical_knowledge
362
+ - mmlu_college_biology
363
+ - mmlu_college_computer_science
364
+ - mmlu_college_mathematics
365
+ - mmlu_college_medicine
366
+ - mmlu_college_physics
367
+ - mmlu_conceptual_physics
368
+ - mmlu_electrical_engineering
369
+ - mmlu_elementary_mathematics
370
+ - mmlu_formal_logic
371
+ - mmlu_high_school_biology
372
+ - mmlu_high_school_chemistry
373
+ - mmlu_high_school_computer_science
374
+ - mmlu_high_school_european_history
375
+ - mmlu_high_school_geography
376
+ - mmlu_high_school_government_and_politics
377
+ - mmlu_high_school_macroeconomics
378
+ - mmlu_high_school_mathematics
379
+ - mmlu_high_school_microeconomics
380
+ - mmlu_high_school_physics
381
+ - mmlu_high_school_psychology
382
+ - mmlu_high_school_statistics
383
+ - mmlu_high_school_us_history
384
+ - mmlu_high_school_world_history
385
+ - mmlu_human_aging
386
+ - mmlu_human_sexuality
387
+ - mmlu_international_law
388
+ - mmlu_logical_fallacies
389
+ - mmlu_machine_learning
390
+ - mmlu_management
391
+ - mmlu_marketing
392
+ - mmlu_medical_genetics
393
+ - mmlu_miscellaneous
394
+ - mmlu_moral_disputes
395
+ - mmlu_moral_scenarios
396
+ - mmlu_nutrition
397
+ - mmlu_prehistory
398
+ - mmlu_professional_accounting
399
+ - mmlu_professional_law
400
+ - mmlu_professional_psychology
401
+ - mmlu_public_relations
402
+ - mmlu_security_studies
403
+ - mmlu_sociology
404
+ - mmlu_virology
405
+ - mmlu_world_religions
406
+
407
+ - name: mmlu
408
+ display_name: Massive Multitask Language Understanding (MMLU) All Subjects
409
+ short_display_name: MMLU All Subjects
410
+ description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
411
+ metric_groups:
412
+ - accuracy
413
+ - efficiency
414
+ - general_information
415
+ environment:
416
+ main_name: exact_match
417
+ main_split: test
418
+ taxonomy:
419
+ task: multiple-choice question answering
420
+ what: math, science, history, etc.
421
+ who: various online sources
422
+ when: before 2021
423
+ language: English
424
+
425
+ - name: mmlu_abstract_algebra
426
+ display_name: Abstract Algebra
427
+ short_display_name: Abstract Algebra
428
+ description: The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.
429
+ metric_groups:
430
+ - accuracy
431
+ - efficiency
432
+ - general_information
433
+ environment:
434
+ main_name: exact_match
435
+ main_split: test
436
+ taxonomy:
437
+ task: multiple-choice question answering
438
+ what: abstract algebra
439
+ who: various online sources
440
+ when: before 2021
441
+ language: English
442
+
443
+ - name: mmlu_anatomy
444
+ display_name: Anatomy
445
+ short_display_name: Anatomy
446
+ description: The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
447
+ metric_groups:
448
+ - accuracy
449
+ - efficiency
450
+ - general_information
451
+ environment:
452
+ main_name: exact_match
453
+ main_split: test
454
+ taxonomy:
455
+ task: multiple-choice question answering
456
+ what: anatomy
457
+ who: various online sources
458
+ when: before 2021
459
+ language: English
460
+
461
+ - name: mmlu_college_chemistry
462
+ display_name: College Chemistry
463
+ short_display_name: College Chemistry
464
+ description: The college chemistry subject in the Massive Multitask Language Understanding (MMLU) benchmark.
465
+ metric_groups:
466
+ - accuracy
467
+ - efficiency
468
+ - general_information
469
+ environment:
470
+ main_name: exact_match
471
+ main_split: test
472
+ taxonomy:
473
+ task: multiple-choice question answering
474
+ what: college chemistry
475
+ who: various online sources
476
+ when: before 2021
477
+ language: English
478
+
479
+ - name: mmlu_computer_security
480
+ display_name: Computer Security
481
+ short_display_name: Computer Security
482
+ description: The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.
483
+ metric_groups:
484
+ - accuracy
485
+ - efficiency
486
+ - general_information
487
+ environment:
488
+ main_name: exact_match
489
+ main_split: test
490
+ taxonomy:
491
+ task: multiple-choice question answering
492
+ what: computer security
493
+ who: various online sources
494
+ when: before 2021
495
+ language: English
496
+
497
+ - name: mmlu_econometrics
498
+ display_name: Econometrics
499
+ short_display_name: Econometrics
500
+ description: The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
501
+ metric_groups:
502
+ - accuracy
503
+ - efficiency
504
+ - general_information
505
+ environment:
506
+ main_name: exact_match
507
+ main_split: test
508
+ taxonomy:
509
+ task: multiple-choice question answering
510
+ what: econometrics
511
+ who: various online sources
512
+ when: before 2021
513
+ language: English
514
+
515
+ - name: mmlu_global_facts
516
+ display_name: Global Facts
517
+ short_display_name: Global Facts
518
+ description: The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.
519
+ metric_groups:
520
+ - accuracy
521
+ - efficiency
522
+ - general_information
523
+ environment:
524
+ main_name: exact_match
525
+ main_split: test
526
+ taxonomy:
527
+ task: multiple-choice question answering
528
+ what: global facts
529
+ who: various online sources
530
+ when: before 2021
531
+ language: English
532
+
533
+ - name: mmlu_jurisprudence
534
+ display_name: Jurisprudence
535
+ short_display_name: Jurisprudence
536
+ description: The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.
537
+ metric_groups:
538
+ - accuracy
539
+ - efficiency
540
+ - general_information
541
+ environment:
542
+ main_name: exact_match
543
+ main_split: test
544
+ taxonomy:
545
+ task: multiple-choice question answering
546
+ what: jurisprudence
547
+ who: various online sources
548
+ when: before 2021
549
+ language: English
550
+
551
+ - name: mmlu_philosophy
552
+ display_name: Philosophy
553
+ short_display_name: Philosophy
554
+ description: The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
555
+ metric_groups:
556
+ - accuracy
557
+ - efficiency
558
+ - general_information
559
+ environment:
560
+ main_name: exact_match
561
+ main_split: test
562
+ taxonomy:
563
+ task: multiple-choice question answering
564
+ what: philosophy
565
+ who: various online sources
566
+ when: before 2021
567
+ language: English
568
+
569
+ - name: mmlu_professional_medicine
570
+ display_name: Professional Medicine
571
+ short_display_name: Professional Medicine
572
+ description: The professional medicine subject in the Massive Multitask Language Understanding (MMLU) benchmark.
573
+ metric_groups:
574
+ - accuracy
575
+ - efficiency
576
+ - general_information
577
+ environment:
578
+ main_name: exact_match
579
+ main_split: test
580
+ taxonomy:
581
+ task: multiple-choice question answering
582
+ what: professional medicine
583
+ who: various online sources
584
+ when: before 2021
585
+ language: English
586
+
587
+ - name: mmlu_us_foreign_policy
588
+ display_name: US Foreign Policy
589
+ short_display_name: US Foreign Policy
590
+ description: The US foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
591
+ metric_groups:
592
+ - accuracy
593
+ - efficiency
594
+ - general_information
595
+ environment:
596
+ main_name: exact_match
597
+ main_split: test
598
+ taxonomy:
599
+ task: multiple-choice question answering
600
+ what: us foreign policy
601
+ who: various online sources
602
+ when: before 2021
603
+ language: English
604
+
605
+ - name: mmlu_astronomy
606
+ display_name: Astronomy
607
+ short_display_name: Astronomy
608
+ description: The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
609
+ metric_groups:
610
+ - accuracy
611
+ - efficiency
612
+ - general_information
613
+ environment:
614
+ main_name: exact_match
615
+ main_split: test
616
+ taxonomy:
617
+ task: multiple-choice question answering
618
+ what: astronomy
619
+ who: various online sources
620
+ when: before 2021
621
+ language: English
622
+
623
+ - name: mmlu_business_ethics
624
+ display_name: Business Ethics
625
+ short_display_name: Business Ethics
626
+ description: The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
627
+ metric_groups:
628
+ - accuracy
629
+ - efficiency
630
+ - general_information
631
+ environment:
632
+ main_name: exact_match
633
+ main_split: test
634
+ taxonomy:
635
+ task: multiple-choice question answering
636
+ what: business ethics
637
+ who: various online sources
638
+ when: before 2021
639
+ language: English
640
+
641
+ - name: mmlu_clinical_knowledge
642
+ display_name: Clinical Knowledge
643
+ short_display_name: Clinical Knowledge
644
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.
645
+ metric_groups:
646
+ - accuracy
647
+ - efficiency
648
+ - general_information
649
+ environment:
650
+ main_name: exact_match
651
+ main_split: test
652
+ taxonomy:
653
+ task: multiple-choice question answering
654
+ what: clinical knowledge
655
+ who: various online sources
656
+ when: before 2021
657
+ language: English
658
+
659
+ - name: mmlu_college_biology
660
+ display_name: College Biology
661
+ short_display_name: College Biology
662
+ description: The college biology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
663
+ metric_groups:
664
+ - accuracy
665
+ - efficiency
666
+ - general_information
667
+ environment:
668
+ main_name: exact_match
669
+ main_split: test
670
+ taxonomy:
671
+ task: multiple-choice question answering
672
+ what: college biology
673
+ who: various online sources
674
+ when: before 2021
675
+ language: English
676
+
677
+ - name: mmlu_college_computer_science
678
+ display_name: College Computer Science
679
+ short_display_name: College Computer Science
680
+ description: The college computer science subject in the Massive Multitask Language Understanding (MMLU) benchmark.
681
+ metric_groups:
682
+ - accuracy
683
+ - efficiency
684
+ - general_information
685
+ environment:
686
+ main_name: exact_match
687
+ main_split: test
688
+ taxonomy:
689
+ task: multiple-choice question answering
690
+ what: college computer science
691
+ who: various online sources
692
+ when: before 2021
693
+ language: English
694
+
695
+ - name: mmlu_college_mathematics
696
+ display_name: College Mathematics
697
+ short_display_name: College Mathematics
698
+ description: The college mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
699
+ metric_groups:
700
+ - accuracy
701
+ - efficiency
702
+ - general_information
703
+ environment:
704
+ main_name: exact_match
705
+ main_split: test
706
+ taxonomy:
707
+ task: multiple-choice question answering
708
+ what: college mathematics
709
+ who: various online sources
710
+ when: before 2021
711
+ language: English
712
+
713
+ - name: mmlu_college_medicine
714
+ display_name: College Medicine
715
+ short_display_name: College Medicine
716
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) benchmark.
717
+ metric_groups:
718
+ - accuracy
719
+ - efficiency
720
+ - general_information
721
+ environment:
722
+ main_name: exact_match
723
+ main_split: test
724
+ taxonomy:
725
+ task: multiple-choice question answering
726
+ what: college medicine
727
+ who: various online sources
728
+ when: before 2021
729
+ language: English
730
+
731
+ - name: mmlu_college_physics
732
+ display_name: College Physics
733
+ short_display_name: College Physics
734
+ description: The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
735
+ metric_groups:
736
+ - accuracy
737
+ - efficiency
738
+ - general_information
739
+ environment:
740
+ main_name: exact_match
741
+ main_split: test
742
+ taxonomy:
743
+ task: multiple-choice question answering
744
+ what: college physics
745
+ who: various online sources
746
+ when: before 2021
747
+ language: English
748
+
749
+ - name: mmlu_conceptual_physics
750
+ display_name: Conceptual Physics
751
+ short_display_name: Conceptual Physics
752
+ description: The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
753
+ metric_groups:
754
+ - accuracy
755
+ - efficiency
756
+ - general_information
757
+ environment:
758
+ main_name: exact_match
759
+ main_split: test
760
+ taxonomy:
761
+ task: multiple-choice question answering
762
+ what: conceptual physics
763
+ who: various online sources
764
+ when: before 2021
765
+ language: English
766
+
767
+ - name: mmlu_electrical_engineering
768
+ display_name: Electrical Engineering
769
+ short_display_name: Electrical Engineering
770
+ description: The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.
771
+ metric_groups:
772
+ - accuracy
773
+ - efficiency
774
+ - general_information
775
+ environment:
776
+ main_name: exact_match
777
+ main_split: test
778
+ taxonomy:
779
+ task: multiple-choice question answering
780
+ what: electrical engineering
781
+ who: various online sources
782
+ when: before 2021
783
+ language: English
784
+
785
+ - name: mmlu_elementary_mathematics
786
+ display_name: Elementary Mathematics
787
+ short_display_name: Elementary Mathematics
788
+ description: The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
789
+ metric_groups:
790
+ - accuracy
791
+ - efficiency
792
+ - general_information
793
+ environment:
794
+ main_name: exact_match
795
+ main_split: test
796
+ taxonomy:
797
+ task: multiple-choice question answering
798
+ what: elementary mathematics
799
+ who: various online sources
800
+ when: before 2021
801
+ language: English
802
+
803
+ - name: mmlu_formal_logic
804
+ display_name: Formal Logic
805
+ short_display_name: Formal Logic
806
+ description: The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.
807
+ metric_groups:
808
+ - accuracy
809
+ - efficiency
810
+ - general_information
811
+ environment:
812
+ main_name: exact_match
813
+ main_split: test
814
+ taxonomy:
815
+ task: multiple-choice question answering
816
+ what: formal logic
817
+ who: various online sources
818
+ when: before 2021
819
+ language: English
820
+
821
+ - name: mmlu_high_school_biology
822
+ display_name: High School Biology
823
+ short_display_name: High School Biology
824
+ description: The high school biology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
825
+ metric_groups:
826
+ - accuracy
827
+ - efficiency
828
+ - general_information
829
+ environment:
830
+ main_name: exact_match
831
+ main_split: test
832
+ taxonomy:
833
+ task: multiple-choice question answering
834
+ what: high school biology
835
+ who: various online sources
836
+ when: before 2021
837
+ language: English
838
+
839
+ - name: mmlu_high_school_chemistry
840
+ display_name: High School Chemistry
841
+ short_display_name: High School Chemistry
842
+ description: The high school chemistry subject in the Massive Multitask Language Understanding (MMLU) benchmark.
843
+ metric_groups:
844
+ - accuracy
845
+ - efficiency
846
+ - general_information
847
+ environment:
848
+ main_name: exact_match
849
+ main_split: test
850
+ taxonomy:
851
+ task: multiple-choice question answering
852
+ what: high school chemistry
853
+ who: various online sources
854
+ when: before 2021
855
+ language: English
856
+
857
+ - name: mmlu_high_school_computer_science
858
+ display_name: High School Computer Science
859
+ short_display_name: High School Computer Science
860
+ description: The high school computer science subject in the Massive Multitask Language Understanding (MMLU) benchmark.
861
+ metric_groups:
862
+ - accuracy
863
+ - efficiency
864
+ - general_information
865
+ environment:
866
+ main_name: exact_match
867
+ main_split: test
868
+ taxonomy:
869
+ task: multiple-choice question answering
870
+ what: high school computer science
871
+ who: various online sources
872
+ when: before 2021
873
+ language: English
874
+
875
+ - name: mmlu_high_school_european_history
876
+ display_name: High School European History
877
+ short_display_name: High School European History
878
+ description: The high school European history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
879
+ metric_groups:
880
+ - accuracy
881
+ - efficiency
882
+ - general_information
883
+ environment:
884
+ main_name: exact_match
885
+ main_split: test
886
+ taxonomy:
887
+ task: multiple-choice question answering
888
+ what: high school european history
889
+ who: various online sources
890
+ when: before 2021
891
+ language: English
892
+
893
+ - name: mmlu_high_school_geography
894
+ display_name: High School Geography
895
+ short_display_name: High School Geography
896
+ description: The high school geography subject in the Massive Multitask Language Understanding (MMLU) benchmark.
897
+ metric_groups:
898
+ - accuracy
899
+ - efficiency
900
+ - general_information
901
+ environment:
902
+ main_name: exact_match
903
+ main_split: test
904
+ taxonomy:
905
+ task: multiple-choice question answering
906
+ what: high school geography
907
+ who: various online sources
908
+ when: before 2021
909
+ language: English
910
+
911
+ - name: mmlu_high_school_government_and_politics
912
+ display_name: High School Government And Politics
913
+ short_display_name: High School Government And Politics
914
+ description: The high school government and politics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
915
+ metric_groups:
916
+ - accuracy
917
+ - efficiency
918
+ - general_information
919
+ environment:
920
+ main_name: exact_match
921
+ main_split: test
922
+ taxonomy:
923
+ task: multiple-choice question answering
924
+ what: high school government and politics
925
+ who: various online sources
926
+ when: before 2021
927
+ language: English
928
+
929
+ - name: mmlu_high_school_macroeconomics
930
+ display_name: High School Macroeconomics
931
+ short_display_name: High School Macroeconomics
932
+ description: The high school macroeconomics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
933
+ metric_groups:
934
+ - accuracy
935
+ - efficiency
936
+ - general_information
937
+ environment:
938
+ main_name: exact_match
939
+ main_split: test
940
+ taxonomy:
941
+ task: multiple-choice question answering
942
+ what: high school macroeconomics
943
+ who: various online sources
944
+ when: before 2021
945
+ language: English
946
+
947
+ - name: mmlu_high_school_mathematics
948
+ display_name: High School Mathematics
949
+ short_display_name: High School Mathematics
950
+ description: The high school mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
951
+ metric_groups:
952
+ - accuracy
953
+ - efficiency
954
+ - general_information
955
+ environment:
956
+ main_name: exact_match
957
+ main_split: test
958
+ taxonomy:
959
+ task: multiple-choice question answering
960
+ what: high school mathematics
961
+ who: various online sources
962
+ when: before 2021
963
+ language: English
964
+
965
+ - name: mmlu_high_school_microeconomics
966
+ display_name: High School Microeconomics
967
+ short_display_name: High School Microeconomics
968
+ description: The high school microeconomics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
969
+ metric_groups:
970
+ - accuracy
971
+ - efficiency
972
+ - general_information
973
+ environment:
974
+ main_name: exact_match
975
+ main_split: test
976
+ taxonomy:
977
+ task: multiple-choice question answering
978
+ what: high school microeconomics
979
+ who: various online sources
980
+ when: before 2021
981
+ language: English
982
+
983
+ - name: mmlu_high_school_physics
984
+ display_name: High School Physics
985
+ short_display_name: High School Physics
986
+ description: The high school physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
987
+ metric_groups:
988
+ - accuracy
989
+ - efficiency
990
+ - general_information
991
+ environment:
992
+ main_name: exact_match
993
+ main_split: test
994
+ taxonomy:
995
+ task: multiple-choice question answering
996
+ what: high school physics
997
+ who: various online sources
998
+ when: before 2021
999
+ language: English
1000
+
1001
+ - name: mmlu_high_school_psychology
1002
+ display_name: High School Psychology
1003
+ short_display_name: High School Psychology
1004
+ description: The high school psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1005
+ metric_groups:
1006
+ - accuracy
1007
+ - efficiency
1008
+ - general_information
1009
+ environment:
1010
+ main_name: exact_match
1011
+ main_split: test
1012
+ taxonomy:
1013
+ task: multiple-choice question answering
1014
+ what: high school psychology
1015
+ who: various online sources
1016
+ when: before 2021
1017
+ language: English
1018
+
1019
+ - name: mmlu_high_school_statistics
1020
+ display_name: High School Statistics
1021
+ short_display_name: High School Statistics
1022
+ description: The high school statistics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1023
+ metric_groups:
1024
+ - accuracy
1025
+ - efficiency
1026
+ - general_information
1027
+ environment:
1028
+ main_name: exact_match
1029
+ main_split: test
1030
+ taxonomy:
1031
+ task: multiple-choice question answering
1032
+ what: high school statistics
1033
+ who: various online sources
1034
+ when: before 2021
1035
+ language: English
1036
+
1037
+ - name: mmlu_high_school_us_history
1038
+ display_name: High School US History
1039
+ short_display_name: High School US History
1040
+ description: The high school US history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1041
+ metric_groups:
1042
+ - accuracy
1043
+ - efficiency
1044
+ - general_information
1045
+ environment:
1046
+ main_name: exact_match
1047
+ main_split: test
1048
+ taxonomy:
1049
+ task: multiple-choice question answering
1050
+ what: high school us history
1051
+ who: various online sources
1052
+ when: before 2021
1053
+ language: English
1054
+
1055
+ - name: mmlu_high_school_world_history
1056
+ display_name: High School World History
1057
+ short_display_name: High School World History
1058
+ description: The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1059
+ metric_groups:
1060
+ - accuracy
1061
+ - efficiency
1062
+ - general_information
1063
+ environment:
1064
+ main_name: exact_match
1065
+ main_split: test
1066
+ taxonomy:
1067
+ task: multiple-choice question answering
1068
+ what: high school world history
1069
+ who: various online sources
1070
+ when: before 2021
1071
+ language: English
1072
+
1073
+ - name: mmlu_human_aging
1074
+ display_name: Human Aging
1075
+ short_display_name: Human Aging
1076
+ description: The human aging subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1077
+ metric_groups:
1078
+ - accuracy
1079
+ - efficiency
1080
+ - general_information
1081
+ environment:
1082
+ main_name: exact_match
1083
+ main_split: test
1084
+ taxonomy:
1085
+ task: multiple-choice question answering
1086
+ what: human aging
1087
+ who: various online sources
1088
+ when: before 2021
1089
+ language: English
1090
+
1091
+ - name: mmlu_human_sexuality
1092
+ display_name: Human Sexuality
1093
+ short_display_name: Human Sexuality
1094
+ description: The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1095
+ metric_groups:
1096
+ - accuracy
1097
+ - efficiency
1098
+ - general_information
1099
+ environment:
1100
+ main_name: exact_match
1101
+ main_split: test
1102
+ taxonomy:
1103
+ task: multiple-choice question answering
1104
+ what: human sexuality
1105
+ who: various online sources
1106
+ when: before 2021
1107
+ language: English
1108
+
1109
+ - name: mmlu_international_law
1110
+ display_name: International Law
1111
+ short_display_name: International Law
1112
+ description: The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1113
+ metric_groups:
1114
+ - accuracy
1115
+ - efficiency
1116
+ - general_information
1117
+ environment:
1118
+ main_name: exact_match
1119
+ main_split: test
1120
+ taxonomy:
1121
+ task: multiple-choice question answering
1122
+ what: international law
1123
+ who: various online sources
1124
+ when: before 2021
1125
+ language: English
1126
+
1127
+ - name: mmlu_logical_fallacies
1128
+ display_name: Logical Fallacies
1129
+ short_display_name: Logical Fallacies
1130
+ description: The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1131
+ metric_groups:
1132
+ - accuracy
1133
+ - efficiency
1134
+ - general_information
1135
+ environment:
1136
+ main_name: exact_match
1137
+ main_split: test
1138
+ taxonomy:
1139
+ task: multiple-choice question answering
1140
+ what: logical fallacies
1141
+ who: various online sources
1142
+ when: before 2021
1143
+ language: English
1144
+
1145
+ - name: mmlu_machine_learning
1146
+ display_name: Machine Learning
1147
+ short_display_name: Machine Learning
1148
+ description: The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1149
+ metric_groups:
1150
+ - accuracy
1151
+ - efficiency
1152
+ - general_information
1153
+ environment:
1154
+ main_name: exact_match
1155
+ main_split: test
1156
+ taxonomy:
1157
+ task: multiple-choice question answering
1158
+ what: machine learning
1159
+ who: various online sources
1160
+ when: before 2021
1161
+ language: English
1162
+
1163
+ - name: mmlu_management
1164
+ display_name: Management
1165
+ short_display_name: Management
1166
+ description: The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1167
+ metric_groups:
1168
+ - accuracy
1169
+ - efficiency
1170
+ - general_information
1171
+ environment:
1172
+ main_name: exact_match
1173
+ main_split: test
1174
+ taxonomy:
1175
+ task: multiple-choice question answering
1176
+ what: management
1177
+ who: various online sources
1178
+ when: before 2021
1179
+ language: English
1180
+
1181
+ - name: mmlu_marketing
1182
+ display_name: Marketing
1183
+ short_display_name: Marketing
1184
+ description: The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1185
+ metric_groups:
1186
+ - accuracy
1187
+ - efficiency
1188
+ - general_information
1189
+ environment:
1190
+ main_name: exact_match
1191
+ main_split: test
1192
+ taxonomy:
1193
+ task: multiple-choice question answering
1194
+ what: marketing
1195
+ who: various online sources
1196
+ when: before 2021
1197
+ language: English
1198
+
1199
+ - name: mmlu_medical_genetics
1200
+ display_name: Medical Genetics
1201
+ short_display_name: Medical Genetics
1202
+ description: The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1203
+ metric_groups:
1204
+ - accuracy
1205
+ - efficiency
1206
+ - general_information
1207
+ environment:
1208
+ main_name: exact_match
1209
+ main_split: test
1210
+ taxonomy:
1211
+ task: multiple-choice question answering
1212
+ what: medical genetics
1213
+ who: various online sources
1214
+ when: before 2021
1215
+ language: English
1216
+
1217
+ - name: mmlu_miscellaneous
1218
+ display_name: Miscellaneous
1219
+ short_display_name: Miscellaneous
1220
+ description: The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1221
+ metric_groups:
1222
+ - accuracy
1223
+ - efficiency
1224
+ - general_information
1225
+ environment:
1226
+ main_name: exact_match
1227
+ main_split: test
1228
+ taxonomy:
1229
+ task: multiple-choice question answering
1230
+ what: miscellaneous
1231
+ who: various online sources
1232
+ when: before 2021
1233
+ language: English
1234
+
1235
+ - name: mmlu_moral_disputes
1236
+ display_name: Moral Disputes
1237
+ short_display_name: Moral Disputes
1238
+ description: The moral disputes subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1239
+ metric_groups:
1240
+ - accuracy
1241
+ - efficiency
1242
+ - general_information
1243
+ environment:
1244
+ main_name: exact_match
1245
+ main_split: test
1246
+ taxonomy:
1247
+ task: multiple-choice question answering
1248
+ what: moral disputes
1249
+ who: various online sources
1250
+ when: before 2021
1251
+ language: English
1252
+
1253
+ - name: mmlu_moral_scenarios
1254
+ display_name: Moral Scenarios
1255
+ short_display_name: Moral Scenarios
1256
+ description: The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1257
+ metric_groups:
1258
+ - accuracy
1259
+ - efficiency
1260
+ - general_information
1261
+ environment:
1262
+ main_name: exact_match
1263
+ main_split: test
1264
+ taxonomy:
1265
+ task: multiple-choice question answering
1266
+ what: moral scenarios
1267
+ who: various online sources
1268
+ when: before 2021
1269
+ language: English
1270
+
1271
+ - name: mmlu_nutrition
1272
+ display_name: Nutrition
1273
+ short_display_name: Nutrition
1274
+ description: The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1275
+ metric_groups:
1276
+ - accuracy
1277
+ - efficiency
1278
+ - general_information
1279
+ environment:
1280
+ main_name: exact_match
1281
+ main_split: test
1282
+ taxonomy:
1283
+ task: multiple-choice question answering
1284
+ what: nutrition
1285
+ who: various online sources
1286
+ when: before 2021
1287
+ language: English
1288
+
1289
+ - name: mmlu_prehistory
1290
+ display_name: Prehistory
1291
+ short_display_name: Prehistory
1292
+ description: The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1293
+ metric_groups:
1294
+ - accuracy
1295
+ - efficiency
1296
+ - general_information
1297
+ environment:
1298
+ main_name: exact_match
1299
+ main_split: test
1300
+ taxonomy:
1301
+ task: multiple-choice question answering
1302
+ what: prehistory
1303
+ who: various online sources
1304
+ when: before 2021
1305
+ language: English
1306
+
1307
+ - name: mmlu_professional_accounting
1308
+ display_name: Professional Accounting
1309
+ short_display_name: Professional Accounting
1310
+ description: The professional accounting subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1311
+ metric_groups:
1312
+ - accuracy
1313
+ - efficiency
1314
+ - general_information
1315
+ environment:
1316
+ main_name: exact_match
1317
+ main_split: test
1318
+ taxonomy:
1319
+ task: multiple-choice question answering
1320
+ what: professional accounting
1321
+ who: various online sources
1322
+ when: before 2021
1323
+ language: English
1324
+
1325
+ - name: mmlu_professional_law
1326
+ display_name: Professional Law
1327
+ short_display_name: Professional Law
1328
+ description: The professional law subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1329
+ metric_groups:
1330
+ - accuracy
1331
+ - efficiency
1332
+ - general_information
1333
+ environment:
1334
+ main_name: exact_match
1335
+ main_split: test
1336
+ taxonomy:
1337
+ task: multiple-choice question answering
1338
+ what: professional law
1339
+ who: various online sources
1340
+ when: before 2021
1341
+ language: English
1342
+
1343
+ - name: mmlu_professional_psychology
1344
+ display_name: Professional Psychology
1345
+ short_display_name: Professional Psychology
1346
+ description: The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1347
+ metric_groups:
1348
+ - accuracy
1349
+ - efficiency
1350
+ - general_information
1351
+ environment:
1352
+ main_name: exact_match
1353
+ main_split: test
1354
+ taxonomy:
1355
+ task: multiple-choice question answering
1356
+ what: professional psychology
1357
+ who: various online sources
1358
+ when: before 2021
1359
+ language: English
1360
+
1361
+ - name: mmlu_public_relations
1362
+ display_name: Public Relations
1363
+ short_display_name: Public Relations
1364
+ description: The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1365
+ metric_groups:
1366
+ - accuracy
1367
+ - efficiency
1368
+ - general_information
1369
+ environment:
1370
+ main_name: exact_match
1371
+ main_split: test
1372
+ taxonomy:
1373
+ task: multiple-choice question answering
1374
+ what: public relations
1375
+ who: various online sources
1376
+ when: before 2021
1377
+ language: English
1378
+
1379
+ - name: mmlu_security_studies
1380
+ display_name: Security Studies
1381
+ short_display_name: Security Studies
1382
+ description: The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1383
+ metric_groups:
1384
+ - accuracy
1385
+ - efficiency
1386
+ - general_information
1387
+ environment:
1388
+ main_name: exact_match
1389
+ main_split: test
1390
+ taxonomy:
1391
+ task: multiple-choice question answering
1392
+ what: security studies
1393
+ who: various online sources
1394
+ when: before 2021
1395
+ language: English
1396
+
1397
+ - name: mmlu_sociology
1398
+ display_name: Sociology
1399
+ short_display_name: Sociology
1400
+ description: The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1401
+ metric_groups:
1402
+ - accuracy
1403
+ - efficiency
1404
+ - general_information
1405
+ environment:
1406
+ main_name: exact_match
1407
+ main_split: test
1408
+ taxonomy:
1409
+ task: multiple-choice question answering
1410
+ what: sociology
1411
+ who: various online sources
1412
+ when: before 2021
1413
+ language: English
1414
+
1415
+ - name: mmlu_virology
1416
+ display_name: Virology
1417
+ short_display_name: Virology
1418
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1419
+ metric_groups:
1420
+ - accuracy
1421
+ - efficiency
1422
+ - general_information
1423
+ environment:
1424
+ main_name: exact_match
1425
+ main_split: test
1426
+ taxonomy:
1427
+ task: multiple-choice question answering
1428
+ what: virology
1429
+ who: various online sources
1430
+ when: before 2021
1431
+ language: English
1432
+
1433
+ - name: mmlu_world_religions
1434
+ display_name: World Religions
1435
+ short_display_name: World Religions
1436
+ description: The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1437
+ metric_groups:
1438
+ - accuracy
1439
+ - efficiency
1440
+ - general_information
1441
+ environment:
1442
+ main_name: exact_match
1443
+ main_split: test
1444
+ taxonomy:
1445
+ task: multiple-choice question answering
1446
+ what: world religions
1447
+ who: various online sources
1448
+ when: before 2021
1449
+ language: English