crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,1507 @@
1
+ ---
2
+ ############################################################
3
+ adapter:
4
+ - name: method
5
+ description: The high-level strategy for converting instances into a prompt for the language model.
6
+ values:
7
+ - name: generation
8
+ description: Given the input, the model generates the output free-form.
9
+ - name: multiple_choice_joint
10
+ description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
11
+ - name: multiple_choice_separate_original
12
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
13
+ - name: multiple_choice_separate_calibrated
14
+ description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
15
+ - name: language_modeling
16
+ description: Given the input, the model assigns the sequence a probability.
17
+ - name: instructions
18
+ description: The description of the task that is included at the very beginning of the prompt.
19
+ - name: global_prefix
20
+ description: The string that is prepended to the prompt.
21
+ - name: global_suffix
22
+ description: The string that is appended to the prompt.
23
+ - name: instance_prefix
24
+ description: The string that is included before each instance (e.g., '\n\n').
25
+ - name: input_prefix
26
+ description: The string that is included before each input (e.g., 'Question:').
27
+ - name: input_suffix
28
+ description: The string that is included after each input (e.g., '\n').
29
+ - name: reference_prefix
30
+ description: The string that is included before each reference (for multiple-choice questions).
31
+ - name: reference_suffix
32
+ description: The string that is included after each reference (for multiple-choice questions).
33
+ - name: output_prefix
34
+ description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
35
+ - name: output_suffix
36
+ description: The string that is included after the correct answer/predicted output (e.g., '\n').
37
+ - name: substitutions
38
+ description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
39
+ - name: max_train_instances
40
+ description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
41
+ - name: max_eval_instances
42
+ description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
43
+ - name: num_outputs
44
+ description: Maximum number of possible outputs to generate by sampling multiple outputs.
45
+ - name: num_train_trials
46
+ description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
47
+ - name: sample_train
48
+ description: If true, randomly sample N training examples; if false, select N consecutive training examples.
49
+ - name: model
50
+ description: Name of the language model (<creator_organization>/<model name>) to send requests to.
51
+ - name: model_deployment
52
+ description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
53
+ - name: temperature
54
+ description: Temperature parameter used in generation.
55
+ - name: max_tokens
56
+ description: Maximum number of tokens to generate.
57
+ - name: stop_sequences
58
+ description: List of sequences, where we stop generation if we encounter any of them.
59
+ - name: random
60
+ description: Random seed (string), which guarantees reproducibility.
61
+ - name: multi_label
62
+ description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
63
+
64
+ ############################################################
65
+ metrics:
66
+ # Infrastructure metrics:
67
+ - name: num_perplexity_tokens
68
+ display_name: '# tokens'
69
+ description: Average number of tokens in the predicted output (for language modeling, the input too).
70
+ - name: num_bytes
71
+ display_name: '# bytes'
72
+ description: Average number of bytes in the predicted output (for language modeling, the input too).
73
+
74
+ - name: num_references
75
+ display_name: '# ref'
76
+ description: Number of references.
77
+ - name: num_train_trials
78
+ display_name: '# trials'
79
+ description: Number of trials, where in each trial we choose an independent, random set of training instances.
80
+ - name: estimated_num_tokens_cost
81
+ display_name: 'cost'
82
+ description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
83
+ - name: num_prompt_tokens
84
+ display_name: '# prompt tokens'
85
+ description: Number of tokens in the prompt.
86
+ - name: num_prompt_characters
87
+ display_name: '# prompt chars'
88
+ description: Number of characters in the prompt.
89
+ - name: num_completion_tokens
90
+ display_name: '# completion tokens'
91
+ description: Actual number of completion tokens (over all completions).
92
+ - name: num_output_tokens
93
+ display_name: '# output tokens'
94
+ description: Actual number of output tokens.
95
+ - name: max_num_output_tokens
96
+ display_name: 'Max output tokens'
97
+ description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
98
+ - name: num_requests
99
+ display_name: '# requests'
100
+ description: Number of distinct API requests.
101
+ - name: num_instances
102
+ display_name: '# eval'
103
+ description: Number of evaluation instances.
104
+ - name: num_train_instances
105
+ display_name: '# train'
106
+ description: Number of training instances (e.g., in-context examples).
107
+ - name: prompt_truncated
108
+ display_name: truncated
109
+ description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
110
+ - name: finish_reason_length
111
+ display_name: finish b/c length
112
+ description: Fraction of instances where the output was terminated because of the max tokens limit.
113
+ - name: finish_reason_stop
114
+ display_name: finish b/c stop
115
+ description: Fraction of instances where the output was terminated because of the stop sequences.
116
+ - name: finish_reason_endoftext
117
+ display_name: finish b/c endoftext
118
+ description: Fraction of instances where the output was terminated because the end of text token was generated.
119
+ - name: finish_reason_unknown
120
+ display_name: finish b/c unknown
121
+ description: Fraction of instances where the output was terminated for unknown reasons.
122
+ - name: num_completions
123
+ display_name: '# completions'
124
+ description: Number of completions.
125
+ - name: predicted_index
126
+ display_name: Predicted index
127
+ description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
128
+
129
+ # Accuracy metrics:
130
+ - name: exact_match
131
+ display_name: Exact match
132
+ short_display_name: EM
133
+ description: Fraction of instances that the predicted output matches a correct reference exactly.
134
+ lower_is_better: false
135
+ - name: quasi_exact_match
136
+ display_name: Quasi-exact match
137
+ short_display_name: EM
138
+ description: Fraction of instances that the predicted output matches a correct reference up to light processing.
139
+ lower_is_better: false
140
+ - name: prefix_exact_match
141
+ display_name: Prefix exact match
142
+ short_display_name: PEM
143
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
144
+ lower_is_better: false
145
+ - name: quasi_prefix_exact_match
146
+ # TODO: should call this prefix_quasi_exact_match
147
+ display_name: Prefix quasi-exact match
148
+ short_display_name: PEM
149
+ description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
150
+ lower_is_better: false
151
+
152
+ - name: exact_match@5
153
+ display_name: Exact match @5
154
+ short_display_name: EM@5
155
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
156
+ lower_is_better: false
157
+ - name: quasi_exact_match@5
158
+ display_name: Quasi-exact match @5
159
+ short_display_name: EM@5
160
+ description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
161
+ lower_is_better: false
162
+ - name: prefix_exact_match@5
163
+ display_name: Prefix exact match @5
164
+ short_display_name: PEM@5
165
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference exactly.
166
+ lower_is_better: false
167
+ - name: quasi_prefix_exact_match@5
168
+ display_name: Prefix quasi-exact match @5
169
+ short_display_name: PEM@5
170
+ description: Fraction of instances that the predicted output among the top 5 matches the prefix of a correct reference up to light processing.
171
+ lower_is_better: false
172
+
173
+ - name: logprob
174
+ display_name: Log probability
175
+ short_display_name: Logprob
176
+ description: Predicted output's average log probability (input's log prob for language modeling).
177
+ lower_is_better: false
178
+ - name: logprob_per_byte
179
+ display_name: Log probability / byte
180
+ short_display_name: Logprob/byte
181
+ description: Predicted output's average log probability normalized by the number of bytes.
182
+ lower_is_better: false
183
+ - name: bits_per_byte
184
+ display_name: Bits/byte
185
+ short_display_name: BPB
186
+ lower_is_better: true
187
+ description: Average number of bits per byte according to model probabilities.
188
+ - name: perplexity
189
+ display_name: Perplexity
190
+ short_display_name: PPL
191
+ lower_is_better: true
192
+ description: Perplexity of the output completion (effective branching factor per output token).
193
+ - name: rouge_1
194
+ display_name: ROUGE-1
195
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
196
+ lower_is_better: false
197
+ - name: rouge_2
198
+ display_name: ROUGE-2
199
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
200
+ lower_is_better: false
201
+ - name: rouge_l
202
+ display_name: ROUGE-L
203
+ description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
204
+ lower_is_better: false
205
+ - name: bleu_1
206
+ display_name: BLEU-1
207
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
208
+ lower_is_better: false
209
+ - name: bleu_4
210
+ display_name: BLEU-4
211
+ description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
212
+ lower_is_better: false
213
+ - name: f1_set_match
214
+ display_name: F1 (set match)
215
+ short_display_name: F1
216
+ description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
217
+ lower_is_better: false
218
+ - name: f1_score
219
+ display_name: F1
220
+ description: Average F1 score in terms of word overlap between the model output and correct reference.
221
+ lower_is_better: false
222
+ - name: classification_macro_f1
223
+ display_name: Macro-F1
224
+ description: Population-level macro-averaged F1 score.
225
+ lower_is_better: false
226
+ - name: classification_micro_f1
227
+ display_name: Micro-F1
228
+ description: Population-level micro-averaged F1 score.
229
+ lower_is_better: false
230
+ - name: absolute_value_difference
231
+ display_name: Absolute difference
232
+ short_display_name: Diff.
233
+ lower_is_better: true
234
+ description: Average absolute difference between the model output (converted to a number) and the correct reference.
235
+ - name: distance
236
+ display_name: Geometric distance
237
+ short_display_name: Dist.
238
+ lower_is_better: true
239
+ description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
240
+ - name: percent_valid
241
+ display_name: Valid fraction
242
+ short_display_name: Valid
243
+ description: Fraction of valid model outputs (as a number).
244
+ lower_is_better: false
245
+ - name: NDCG@10
246
+ display_name: NDCG@10
247
+ description: Normalized discounted cumulative gain at 10 in information retrieval.
248
+ lower_is_better: false
249
+ - name: RR@10
250
+ display_name: RR@10
251
+ description: Mean reciprocal rank at 10 in information retrieval.
252
+ lower_is_better: false
253
+ - name: NDCG@20
254
+ display_name: NDCG@20
255
+ description: Normalized discounted cumulative gain at 20 in information retrieval.
256
+ lower_is_better: false
257
+ - name: RR@20
258
+ display_name: RR@20
259
+ description: Mean reciprocal rank at 20 in information retrieval.
260
+ lower_is_better: false
261
+ - name: math_equiv
262
+ display_name: Equivalent
263
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference.
264
+ lower_is_better: false
265
+ - name: math_equiv_chain_of_thought
266
+ display_name: Equivalent (CoT)
267
+ description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
268
+ lower_is_better: false
269
+ - name: exact_match_indicator
270
+ display_name: Exact match (final)
271
+ short_display_name: EM
272
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
273
+ lower_is_better: false
274
+ - name: final_number_exact_match
275
+ display_name: Exact match (final number)
276
+ short_display_name: EM
277
+ description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
278
+ lower_is_better: false
279
+ - name: exact_set_match
280
+ display_name: Exact match (at sets)
281
+ short_display_name: EM
282
+ description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
283
+ lower_is_better: false
284
+ - name: iou_set_match
285
+ display_name: Intersection over union (as sets)
286
+ short_display_name: IoU
287
+ description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
288
+ lower_is_better: false
289
+
290
+ # Efficiency metrics:
291
+ - name: training_co2_cost
292
+ display_name: Estimated training emissions (kg CO2)
293
+ short_display_name: Training emissions (kg CO2)
294
+ lower_is_better: true
295
+ description: Estimate of the CO2 emissions from training the model.
296
+ - name: training_energy_cost
297
+ display_name: Estimated training energy cost (MWh)
298
+ short_display_name: Training energy (MWh)
299
+ lower_is_better: true
300
+ description: Estimate of the amount of energy used to train the model.
301
+ - name: inference_runtime
302
+ display_name: Observed inference runtime (s)
303
+ short_display_name: Observed inference time (s)
304
+ lower_is_better: true
305
+ description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
306
+ - name: inference_idealized_runtime
307
+ display_name: Idealized inference runtime (s)
308
+ short_display_name: Idealized inference time (s)
309
+ lower_is_better: true
310
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
311
+ - name: inference_denoised_runtime
312
+ display_name: Denoised inference runtime (s)
313
+ short_display_name: Denoised inference time (s)
314
+ lower_is_better: true
315
+ description: Average time to process a request to the model minus performance contention by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
316
+ - name: batch_size
317
+ display_name: Batch size
318
+ description: For batch jobs, how many requests are in a batch.
319
+
320
+ # Calibration metrics:
321
+ - name: ece_1_bin
322
+ display_name: 1-bin expected calibration error
323
+ short_display_name: ECE (1-bin)
324
+ lower_is_better: true
325
+ description: The (absolute value) difference between the model's average confidence and accuracy (only computed for classification tasks).
326
+ - name: max_prob
327
+ display_name: Max prob
328
+ description: Model's average confidence in its prediction (only computed for classification tasks)
329
+ lower_is_better: false
330
+ - name: ece_10_bin
331
+ display_name: 10-bin expected calibration error
332
+ short_display_name: ECE (10-bin)
333
+ lower_is_better: true
334
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
335
+ - name: platt_ece_1_bin
336
+ display_name: 1-bin expected calibration error (after Platt scaling)
337
+ short_display_name: Platt-scaled ECE (1-bin)
338
+ lower_is_better: true
339
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
340
+ - name: platt_ece_10_bin
341
+ display_name: 10-bin Expected Calibration Error (after Platt scaling)
342
+ short_display_name: Platt-scaled ECE (10-bin)
343
+ lower_is_better: true
344
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
345
+ - name: platt_coef
346
+ display_name: Platt Scaling Coefficient
347
+ short_display_name: Platt Coef
348
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
349
+ lower_is_better: false
350
+ - name: platt_intercept
351
+ display_name: Platt Scaling Intercept
352
+ short_display_name: Platt Intercept
353
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
354
+ lower_is_better: false
355
+ - name: selective_cov_acc_area
356
+ display_name: Selective coverage-accuracy area
357
+ short_display_name: Selective Acc
358
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
359
+ lower_is_better: false
360
+ - name: selective_acc@10
361
+ display_name: Accuracy at 10% coverage
362
+ short_display_name: Acc@10%
363
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
364
+ lower_is_better: false
365
+
366
+ ############################################################
367
+ perturbations: []
368
+ ############################################################
369
+ metric_groups:
370
+ - name: accuracy
371
+ display_name: Accuracy
372
+ hide_win_rates: true
373
+ metrics:
374
+ - name: ${main_name}
375
+ split: ${main_split}
376
+
377
+ - name: efficiency
378
+ display_name: Efficiency
379
+ metrics:
380
+ - name: inference_runtime
381
+ split: ${main_split}
382
+
383
+ - name: general_information
384
+ display_name: General information
385
+ hide_win_rates: true
386
+ metrics:
387
+ - name: num_instances
388
+ split: ${main_split}
389
+ - name: num_train_instances
390
+ split: ${main_split}
391
+ - name: prompt_truncated
392
+ split: ${main_split}
393
+ - name: num_prompt_tokens
394
+ split: ${main_split}
395
+ - name: num_output_tokens
396
+ split: ${main_split}
397
+
398
+ ############################################################
399
+ run_groups:
400
+ - name: mmlu_subjects
401
+ display_name: MMLU Subjects
402
+ short_display_name: MMLU Subjects
403
+ description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
404
+ category: All Scenarios
405
+ subgroups:
406
+ - mmlu
407
+ - mmlu_abstract_algebra
408
+ - mmlu_anatomy
409
+ - mmlu_college_chemistry
410
+ - mmlu_computer_security
411
+ - mmlu_econometrics
412
+ - mmlu_global_facts
413
+ - mmlu_jurisprudence
414
+ - mmlu_philosophy
415
+ - mmlu_professional_medicine
416
+ - mmlu_us_foreign_policy
417
+ - mmlu_astronomy
418
+ - mmlu_business_ethics
419
+ - mmlu_clinical_knowledge
420
+ - mmlu_college_biology
421
+ - mmlu_college_computer_science
422
+ - mmlu_college_mathematics
423
+ - mmlu_college_medicine
424
+ - mmlu_college_physics
425
+ - mmlu_conceptual_physics
426
+ - mmlu_electrical_engineering
427
+ - mmlu_elementary_mathematics
428
+ - mmlu_formal_logic
429
+ - mmlu_high_school_biology
430
+ - mmlu_high_school_chemistry
431
+ - mmlu_high_school_computer_science
432
+ - mmlu_high_school_european_history
433
+ - mmlu_high_school_geography
434
+ - mmlu_high_school_government_and_politics
435
+ - mmlu_high_school_macroeconomics
436
+ - mmlu_high_school_mathematics
437
+ - mmlu_high_school_microeconomics
438
+ - mmlu_high_school_physics
439
+ - mmlu_high_school_psychology
440
+ - mmlu_high_school_statistics
441
+ - mmlu_high_school_us_history
442
+ - mmlu_high_school_world_history
443
+ - mmlu_human_aging
444
+ - mmlu_human_sexuality
445
+ - mmlu_international_law
446
+ - mmlu_logical_fallacies
447
+ - mmlu_machine_learning
448
+ - mmlu_management
449
+ - mmlu_marketing
450
+ - mmlu_medical_genetics
451
+ - mmlu_miscellaneous
452
+ - mmlu_moral_disputes
453
+ - mmlu_moral_scenarios
454
+ - mmlu_nutrition
455
+ - mmlu_prehistory
456
+ - mmlu_professional_accounting
457
+ - mmlu_professional_law
458
+ - mmlu_professional_psychology
459
+ - mmlu_public_relations
460
+ - mmlu_security_studies
461
+ - mmlu_sociology
462
+ - mmlu_virology
463
+ - mmlu_world_religions
464
+
465
+ - name: mmlu
466
+ display_name: Massive Multitask Language Understanding (MMLU) All Subjects
467
+ short_display_name: MMLU All Subjects
468
+ description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
469
+ metric_groups:
470
+ - accuracy
471
+ - efficiency
472
+ - general_information
473
+ environment:
474
+ main_name: exact_match
475
+ main_split: test
476
+ taxonomy:
477
+ task: multiple-choice question answering
478
+ what: math, science, history, etc.
479
+ who: various online sources
480
+ when: before 2021
481
+ language: English
482
+
483
+ - name: mmlu_abstract_algebra
484
+ display_name: Abstract Algebra
485
+ short_display_name: Abstract Algebra
486
+ description: The abstract algebra subject in the Massive Multitask Language Understanding (MMLU) benchmark.
487
+ metric_groups:
488
+ - accuracy
489
+ - efficiency
490
+ - general_information
491
+ environment:
492
+ main_name: exact_match
493
+ main_split: test
494
+ taxonomy:
495
+ task: multiple-choice question answering
496
+ what: abstract algebra
497
+ who: various online sources
498
+ when: before 2021
499
+ language: English
500
+
501
+ - name: mmlu_anatomy
502
+ display_name: Anatomy
503
+ short_display_name: Anatomy
504
+ description: The anatomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
505
+ metric_groups:
506
+ - accuracy
507
+ - efficiency
508
+ - general_information
509
+ environment:
510
+ main_name: exact_match
511
+ main_split: test
512
+ taxonomy:
513
+ task: multiple-choice question answering
514
+ what: anatomy
515
+ who: various online sources
516
+ when: before 2021
517
+ language: English
518
+
519
+ - name: mmlu_college_chemistry
520
+ display_name: College Chemistry
521
+ short_display_name: College Chemistry
522
+ description: The college chemistry subject in the Massive Multitask Language Understanding (MMLU) benchmark.
523
+ metric_groups:
524
+ - accuracy
525
+ - efficiency
526
+ - general_information
527
+ environment:
528
+ main_name: exact_match
529
+ main_split: test
530
+ taxonomy:
531
+ task: multiple-choice question answering
532
+ what: college chemistry
533
+ who: various online sources
534
+ when: before 2021
535
+ language: English
536
+
537
+ - name: mmlu_computer_security
538
+ display_name: Computer Security
539
+ short_display_name: Computer Security
540
+ description: The computer security subject in the Massive Multitask Language Understanding (MMLU) benchmark.
541
+ metric_groups:
542
+ - accuracy
543
+ - efficiency
544
+ - general_information
545
+ environment:
546
+ main_name: exact_match
547
+ main_split: test
548
+ taxonomy:
549
+ task: multiple-choice question answering
550
+ what: computer security
551
+ who: various online sources
552
+ when: before 2021
553
+ language: English
554
+
555
+ - name: mmlu_econometrics
556
+ display_name: Econometrics
557
+ short_display_name: Econometrics
558
+ description: The econometrics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
559
+ metric_groups:
560
+ - accuracy
561
+ - efficiency
562
+ - general_information
563
+ environment:
564
+ main_name: exact_match
565
+ main_split: test
566
+ taxonomy:
567
+ task: multiple-choice question answering
568
+ what: econometrics
569
+ who: various online sources
570
+ when: before 2021
571
+ language: English
572
+
573
+ - name: mmlu_global_facts
574
+ display_name: Global Facts
575
+ short_display_name: Global Facts
576
+ description: The global facts subject in the Massive Multitask Language Understanding (MMLU) benchmark.
577
+ metric_groups:
578
+ - accuracy
579
+ - efficiency
580
+ - general_information
581
+ environment:
582
+ main_name: exact_match
583
+ main_split: test
584
+ taxonomy:
585
+ task: multiple-choice question answering
586
+ what: global facts
587
+ who: various online sources
588
+ when: before 2021
589
+ language: English
590
+
591
+ - name: mmlu_jurisprudence
592
+ display_name: Jurisprudence
593
+ short_display_name: Jurisprudence
594
+ description: The jurisprudence subject in the Massive Multitask Language Understanding (MMLU) benchmark.
595
+ metric_groups:
596
+ - accuracy
597
+ - efficiency
598
+ - general_information
599
+ environment:
600
+ main_name: exact_match
601
+ main_split: test
602
+ taxonomy:
603
+ task: multiple-choice question answering
604
+ what: jurisprudence
605
+ who: various online sources
606
+ when: before 2021
607
+ language: English
608
+
609
+ - name: mmlu_philosophy
610
+ display_name: Philosophy
611
+ short_display_name: Philosophy
612
+ description: The philosophy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
613
+ metric_groups:
614
+ - accuracy
615
+ - efficiency
616
+ - general_information
617
+ environment:
618
+ main_name: exact_match
619
+ main_split: test
620
+ taxonomy:
621
+ task: multiple-choice question answering
622
+ what: philosophy
623
+ who: various online sources
624
+ when: before 2021
625
+ language: English
626
+
627
+ - name: mmlu_professional_medicine
628
+ display_name: Professional Medicine
629
+ short_display_name: Professional Medicine
630
+ description: The professional medicine subject in the Massive Multitask Language Understanding (MMLU) benchmark.
631
+ metric_groups:
632
+ - accuracy
633
+ - efficiency
634
+ - general_information
635
+ environment:
636
+ main_name: exact_match
637
+ main_split: test
638
+ taxonomy:
639
+ task: multiple-choice question answering
640
+ what: professional medicine
641
+ who: various online sources
642
+ when: before 2021
643
+ language: English
644
+
645
+ - name: mmlu_us_foreign_policy
646
+ display_name: US Foreign Policy
647
+ short_display_name: US Foreign Policy
648
+ description: The US foreign policy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
649
+ metric_groups:
650
+ - accuracy
651
+ - efficiency
652
+ - general_information
653
+ environment:
654
+ main_name: exact_match
655
+ main_split: test
656
+ taxonomy:
657
+ task: multiple-choice question answering
658
+ what: US foreign policy
659
+ who: various online sources
660
+ when: before 2021
661
+ language: English
662
+
663
+ - name: mmlu_astronomy
664
+ display_name: Astronomy
665
+ short_display_name: Astronomy
666
+ description: The astronomy subject in the Massive Multitask Language Understanding (MMLU) benchmark.
667
+ metric_groups:
668
+ - accuracy
669
+ - efficiency
670
+ - general_information
671
+ environment:
672
+ main_name: exact_match
673
+ main_split: test
674
+ taxonomy:
675
+ task: multiple-choice question answering
676
+ what: astronomy
677
+ who: various online sources
678
+ when: before 2021
679
+ language: English
680
+
681
+ - name: mmlu_business_ethics
682
+ display_name: Business Ethics
683
+ short_display_name: Business Ethics
684
+ description: The business ethics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
685
+ metric_groups:
686
+ - accuracy
687
+ - efficiency
688
+ - general_information
689
+ environment:
690
+ main_name: exact_match
691
+ main_split: test
692
+ taxonomy:
693
+ task: multiple-choice question answering
694
+ what: business ethics
695
+ who: various online sources
696
+ when: before 2021
697
+ language: English
698
+
699
+ - name: mmlu_clinical_knowledge
700
+ display_name: Clinical Knowledge
701
+ short_display_name: Clinical Knowledge
702
+ description: The clinical knowledge subject in the Massive Multitask Language Understanding (MMLU) benchmark.
703
+ metric_groups:
704
+ - accuracy
705
+ - efficiency
706
+ - general_information
707
+ environment:
708
+ main_name: exact_match
709
+ main_split: test
710
+ taxonomy:
711
+ task: multiple-choice question answering
712
+ what: clinical knowledge
713
+ who: various online sources
714
+ when: before 2021
715
+ language: English
716
+
717
+ - name: mmlu_college_biology
718
+ display_name: College Biology
719
+ short_display_name: College Biology
720
+ description: The college biology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
721
+ metric_groups:
722
+ - accuracy
723
+ - efficiency
724
+ - general_information
725
+ environment:
726
+ main_name: exact_match
727
+ main_split: test
728
+ taxonomy:
729
+ task: multiple-choice question answering
730
+ what: college biology
731
+ who: various online sources
732
+ when: before 2021
733
+ language: English
734
+
735
+ - name: mmlu_college_computer_science
736
+ display_name: College Computer Science
737
+ short_display_name: College Computer Science
738
+ description: The college computer science subject in the Massive Multitask Language Understanding (MMLU) benchmark.
739
+ metric_groups:
740
+ - accuracy
741
+ - efficiency
742
+ - general_information
743
+ environment:
744
+ main_name: exact_match
745
+ main_split: test
746
+ taxonomy:
747
+ task: multiple-choice question answering
748
+ what: college computer science
749
+ who: various online sources
750
+ when: before 2021
751
+ language: English
752
+
753
+ - name: mmlu_college_mathematics
754
+ display_name: College Mathematics
755
+ short_display_name: College Mathematics
756
+ description: The college mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
757
+ metric_groups:
758
+ - accuracy
759
+ - efficiency
760
+ - general_information
761
+ environment:
762
+ main_name: exact_match
763
+ main_split: test
764
+ taxonomy:
765
+ task: multiple-choice question answering
766
+ what: college mathematics
767
+ who: various online sources
768
+ when: before 2021
769
+ language: English
770
+
771
+ - name: mmlu_college_medicine
772
+ display_name: College Medicine
773
+ short_display_name: College Medicine
774
+ description: The college medicine subject in the Massive Multitask Language Understanding (MMLU) benchmark.
775
+ metric_groups:
776
+ - accuracy
777
+ - efficiency
778
+ - general_information
779
+ environment:
780
+ main_name: exact_match
781
+ main_split: test
782
+ taxonomy:
783
+ task: multiple-choice question answering
784
+ what: college medicine
785
+ who: various online sources
786
+ when: before 2021
787
+ language: English
788
+
789
+ - name: mmlu_college_physics
790
+ display_name: College Physics
791
+ short_display_name: College Physics
792
+ description: The college physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
793
+ metric_groups:
794
+ - accuracy
795
+ - efficiency
796
+ - general_information
797
+ environment:
798
+ main_name: exact_match
799
+ main_split: test
800
+ taxonomy:
801
+ task: multiple-choice question answering
802
+ what: college physics
803
+ who: various online sources
804
+ when: before 2021
805
+ language: English
806
+
807
+ - name: mmlu_conceptual_physics
808
+ display_name: Conceptual Physics
809
+ short_display_name: Conceptual Physics
810
+ description: The conceptual physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
811
+ metric_groups:
812
+ - accuracy
813
+ - efficiency
814
+ - general_information
815
+ environment:
816
+ main_name: exact_match
817
+ main_split: test
818
+ taxonomy:
819
+ task: multiple-choice question answering
820
+ what: conceptual physics
821
+ who: various online sources
822
+ when: before 2021
823
+ language: English
824
+
825
+ - name: mmlu_electrical_engineering
826
+ display_name: Electrical Engineering
827
+ short_display_name: Electrical Engineering
828
+ description: The electrical engineering subject in the Massive Multitask Language Understanding (MMLU) benchmark.
829
+ metric_groups:
830
+ - accuracy
831
+ - efficiency
832
+ - general_information
833
+ environment:
834
+ main_name: exact_match
835
+ main_split: test
836
+ taxonomy:
837
+ task: multiple-choice question answering
838
+ what: electrical engineering
839
+ who: various online sources
840
+ when: before 2021
841
+ language: English
842
+
843
+ - name: mmlu_elementary_mathematics
844
+ display_name: Elementary Mathematics
845
+ short_display_name: Elementary Mathematics
846
+ description: The elementary mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
847
+ metric_groups:
848
+ - accuracy
849
+ - efficiency
850
+ - general_information
851
+ environment:
852
+ main_name: exact_match
853
+ main_split: test
854
+ taxonomy:
855
+ task: multiple-choice question answering
856
+ what: elementary mathematics
857
+ who: various online sources
858
+ when: before 2021
859
+ language: English
860
+
861
+ - name: mmlu_formal_logic
862
+ display_name: Formal Logic
863
+ short_display_name: Formal Logic
864
+ description: The formal logic subject in the Massive Multitask Language Understanding (MMLU) benchmark.
865
+ metric_groups:
866
+ - accuracy
867
+ - efficiency
868
+ - general_information
869
+ environment:
870
+ main_name: exact_match
871
+ main_split: test
872
+ taxonomy:
873
+ task: multiple-choice question answering
874
+ what: formal logic
875
+ who: various online sources
876
+ when: before 2021
877
+ language: English
878
+
879
+ - name: mmlu_high_school_biology
880
+ display_name: High School Biology
881
+ short_display_name: High School Biology
882
+ description: The high school biology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
883
+ metric_groups:
884
+ - accuracy
885
+ - efficiency
886
+ - general_information
887
+ environment:
888
+ main_name: exact_match
889
+ main_split: test
890
+ taxonomy:
891
+ task: multiple-choice question answering
892
+ what: high school biology
893
+ who: various online sources
894
+ when: before 2021
895
+ language: English
896
+
897
+ - name: mmlu_high_school_chemistry
898
+ display_name: High School Chemistry
899
+ short_display_name: High School Chemistry
900
+ description: The high school chemistry subject in the Massive Multitask Language Understanding (MMLU) benchmark.
901
+ metric_groups:
902
+ - accuracy
903
+ - efficiency
904
+ - general_information
905
+ environment:
906
+ main_name: exact_match
907
+ main_split: test
908
+ taxonomy:
909
+ task: multiple-choice question answering
910
+ what: high school chemistry
911
+ who: various online sources
912
+ when: before 2021
913
+ language: English
914
+
915
+ - name: mmlu_high_school_computer_science
916
+ display_name: High School Computer Science
917
+ short_display_name: High School Computer Science
918
+ description: The high school computer science subject in the Massive Multitask Language Understanding (MMLU) benchmark.
919
+ metric_groups:
920
+ - accuracy
921
+ - efficiency
922
+ - general_information
923
+ environment:
924
+ main_name: exact_match
925
+ main_split: test
926
+ taxonomy:
927
+ task: multiple-choice question answering
928
+ what: high school computer science
929
+ who: various online sources
930
+ when: before 2021
931
+ language: English
932
+
933
+ - name: mmlu_high_school_european_history
934
+ display_name: High School European History
935
+ short_display_name: High School European History
936
+ description: The high school European history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
937
+ metric_groups:
938
+ - accuracy
939
+ - efficiency
940
+ - general_information
941
+ environment:
942
+ main_name: exact_match
943
+ main_split: test
944
+ taxonomy:
945
+ task: multiple-choice question answering
946
+ what: high school european history
947
+ who: various online sources
948
+ when: before 2021
949
+ language: English
950
+
951
+ - name: mmlu_high_school_geography
952
+ display_name: High School Geography
953
+ short_display_name: High School Geography
954
+ description: The high school geography subject in the Massive Multitask Language Understanding (MMLU) benchmark.
955
+ metric_groups:
956
+ - accuracy
957
+ - efficiency
958
+ - general_information
959
+ environment:
960
+ main_name: exact_match
961
+ main_split: test
962
+ taxonomy:
963
+ task: multiple-choice question answering
964
+ what: high school geography
965
+ who: various online sources
966
+ when: before 2021
967
+ language: English
968
+
969
+ - name: mmlu_high_school_government_and_politics
970
+ display_name: High School Government and Politics
971
+ short_display_name: High School Government and Politics
972
+ description: The high school government and politics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
973
+ metric_groups:
974
+ - accuracy
975
+ - efficiency
976
+ - general_information
977
+ environment:
978
+ main_name: exact_match
979
+ main_split: test
980
+ taxonomy:
981
+ task: multiple-choice question answering
982
+ what: high school government and politics
983
+ who: various online sources
984
+ when: before 2021
985
+ language: English
986
+
987
+ - name: mmlu_high_school_macroeconomics
988
+ display_name: High School Macroeconomics
989
+ short_display_name: High School Macroeconomics
990
+ description: The high school macroeconomics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
991
+ metric_groups:
992
+ - accuracy
993
+ - efficiency
994
+ - general_information
995
+ environment:
996
+ main_name: exact_match
997
+ main_split: test
998
+ taxonomy:
999
+ task: multiple-choice question answering
1000
+ what: high school macroeconomics
1001
+ who: various online sources
1002
+ when: before 2021
1003
+ language: English
1004
+
1005
+ - name: mmlu_high_school_mathematics
1006
+ display_name: High School Mathematics
1007
+ short_display_name: High School Mathematics
1008
+ description: The high school mathematics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1009
+ metric_groups:
1010
+ - accuracy
1011
+ - efficiency
1012
+ - general_information
1013
+ environment:
1014
+ main_name: exact_match
1015
+ main_split: test
1016
+ taxonomy:
1017
+ task: multiple-choice question answering
1018
+ what: high school mathematics
1019
+ who: various online sources
1020
+ when: before 2021
1021
+ language: English
1022
+
1023
+ - name: mmlu_high_school_microeconomics
1024
+ display_name: High School Microeconomics
1025
+ short_display_name: High School Microeconomics
1026
+ description: The high school microeconomics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1027
+ metric_groups:
1028
+ - accuracy
1029
+ - efficiency
1030
+ - general_information
1031
+ environment:
1032
+ main_name: exact_match
1033
+ main_split: test
1034
+ taxonomy:
1035
+ task: multiple-choice question answering
1036
+ what: high school microeconomics
1037
+ who: various online sources
1038
+ when: before 2021
1039
+ language: English
1040
+
1041
+ - name: mmlu_high_school_physics
1042
+ display_name: High School Physics
1043
+ short_display_name: High School Physics
1044
+ description: The high school physics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1045
+ metric_groups:
1046
+ - accuracy
1047
+ - efficiency
1048
+ - general_information
1049
+ environment:
1050
+ main_name: exact_match
1051
+ main_split: test
1052
+ taxonomy:
1053
+ task: multiple-choice question answering
1054
+ what: high school physics
1055
+ who: various online sources
1056
+ when: before 2021
1057
+ language: English
1058
+
1059
+ - name: mmlu_high_school_psychology
1060
+ display_name: High School Psychology
1061
+ short_display_name: High School Psychology
1062
+ description: The high school psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1063
+ metric_groups:
1064
+ - accuracy
1065
+ - efficiency
1066
+ - general_information
1067
+ environment:
1068
+ main_name: exact_match
1069
+ main_split: test
1070
+ taxonomy:
1071
+ task: multiple-choice question answering
1072
+ what: high school psychology
1073
+ who: various online sources
1074
+ when: before 2021
1075
+ language: English
1076
+
1077
+ - name: mmlu_high_school_statistics
1078
+ display_name: High School Statistics
1079
+ short_display_name: High School Statistics
1080
+ description: The high school statistics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1081
+ metric_groups:
1082
+ - accuracy
1083
+ - efficiency
1084
+ - general_information
1085
+ environment:
1086
+ main_name: exact_match
1087
+ main_split: test
1088
+ taxonomy:
1089
+ task: multiple-choice question answering
1090
+ what: high school statistics
1091
+ who: various online sources
1092
+ when: before 2021
1093
+ language: English
1094
+
1095
+ - name: mmlu_high_school_us_history
1096
+ display_name: High School US History
1097
+ short_display_name: High School US History
1098
+ description: The high school US history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1099
+ metric_groups:
1100
+ - accuracy
1101
+ - efficiency
1102
+ - general_information
1103
+ environment:
1104
+ main_name: exact_match
1105
+ main_split: test
1106
+ taxonomy:
1107
+ task: multiple-choice question answering
1108
+ what: high school us history
1109
+ who: various online sources
1110
+ when: before 2021
1111
+ language: English
1112
+
1113
+ - name: mmlu_high_school_world_history
1114
+ display_name: High School World History
1115
+ short_display_name: High School World History
1116
+ description: The high school world history subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1117
+ metric_groups:
1118
+ - accuracy
1119
+ - efficiency
1120
+ - general_information
1121
+ environment:
1122
+ main_name: exact_match
1123
+ main_split: test
1124
+ taxonomy:
1125
+ task: multiple-choice question answering
1126
+ what: high school world history
1127
+ who: various online sources
1128
+ when: before 2021
1129
+ language: English
1130
+
1131
+ - name: mmlu_human_aging
1132
+ display_name: Human Aging
1133
+ short_display_name: Human Aging
1134
+ description: The human aging subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1135
+ metric_groups:
1136
+ - accuracy
1137
+ - efficiency
1138
+ - general_information
1139
+ environment:
1140
+ main_name: exact_match
1141
+ main_split: test
1142
+ taxonomy:
1143
+ task: multiple-choice question answering
1144
+ what: human aging
1145
+ who: various online sources
1146
+ when: before 2021
1147
+ language: English
1148
+
1149
+ - name: mmlu_human_sexuality
1150
+ display_name: Human Sexuality
1151
+ short_display_name: Human Sexuality
1152
+ description: The human sexuality subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1153
+ metric_groups:
1154
+ - accuracy
1155
+ - efficiency
1156
+ - general_information
1157
+ environment:
1158
+ main_name: exact_match
1159
+ main_split: test
1160
+ taxonomy:
1161
+ task: multiple-choice question answering
1162
+ what: human sexuality
1163
+ who: various online sources
1164
+ when: before 2021
1165
+ language: English
1166
+
1167
+ - name: mmlu_international_law
1168
+ display_name: International Law
1169
+ short_display_name: International Law
1170
+ description: The international law subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1171
+ metric_groups:
1172
+ - accuracy
1173
+ - efficiency
1174
+ - general_information
1175
+ environment:
1176
+ main_name: exact_match
1177
+ main_split: test
1178
+ taxonomy:
1179
+ task: multiple-choice question answering
1180
+ what: international law
1181
+ who: various online sources
1182
+ when: before 2021
1183
+ language: English
1184
+
1185
+ - name: mmlu_logical_fallacies
1186
+ display_name: Logical Fallacies
1187
+ short_display_name: Logical Fallacies
1188
+ description: The logical fallacies subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1189
+ metric_groups:
1190
+ - accuracy
1191
+ - efficiency
1192
+ - general_information
1193
+ environment:
1194
+ main_name: exact_match
1195
+ main_split: test
1196
+ taxonomy:
1197
+ task: multiple-choice question answering
1198
+ what: logical fallacies
1199
+ who: various online sources
1200
+ when: before 2021
1201
+ language: English
1202
+
1203
+ - name: mmlu_machine_learning
1204
+ display_name: Machine Learning
1205
+ short_display_name: Machine Learning
1206
+ description: The machine learning subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1207
+ metric_groups:
1208
+ - accuracy
1209
+ - efficiency
1210
+ - general_information
1211
+ environment:
1212
+ main_name: exact_match
1213
+ main_split: test
1214
+ taxonomy:
1215
+ task: multiple-choice question answering
1216
+ what: machine learning
1217
+ who: various online sources
1218
+ when: before 2021
1219
+ language: English
1220
+
1221
+ - name: mmlu_management
1222
+ display_name: Management
1223
+ short_display_name: Management
1224
+ description: The management subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1225
+ metric_groups:
1226
+ - accuracy
1227
+ - efficiency
1228
+ - general_information
1229
+ environment:
1230
+ main_name: exact_match
1231
+ main_split: test
1232
+ taxonomy:
1233
+ task: multiple-choice question answering
1234
+ what: management
1235
+ who: various online sources
1236
+ when: before 2021
1237
+ language: English
1238
+
1239
+ - name: mmlu_marketing
1240
+ display_name: Marketing
1241
+ short_display_name: Marketing
1242
+ description: The marketing subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1243
+ metric_groups:
1244
+ - accuracy
1245
+ - efficiency
1246
+ - general_information
1247
+ environment:
1248
+ main_name: exact_match
1249
+ main_split: test
1250
+ taxonomy:
1251
+ task: multiple-choice question answering
1252
+ what: marketing
1253
+ who: various online sources
1254
+ when: before 2021
1255
+ language: English
1256
+
1257
+ - name: mmlu_medical_genetics
1258
+ display_name: Medical Genetics
1259
+ short_display_name: Medical Genetics
1260
+ description: The medical genetics subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1261
+ metric_groups:
1262
+ - accuracy
1263
+ - efficiency
1264
+ - general_information
1265
+ environment:
1266
+ main_name: exact_match
1267
+ main_split: test
1268
+ taxonomy:
1269
+ task: multiple-choice question answering
1270
+ what: medical genetics
1271
+ who: various online sources
1272
+ when: before 2021
1273
+ language: English
1274
+
1275
+ - name: mmlu_miscellaneous
1276
+ display_name: Miscellaneous
1277
+ short_display_name: Miscellaneous
1278
+ description: The miscellaneous subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1279
+ metric_groups:
1280
+ - accuracy
1281
+ - efficiency
1282
+ - general_information
1283
+ environment:
1284
+ main_name: exact_match
1285
+ main_split: test
1286
+ taxonomy:
1287
+ task: multiple-choice question answering
1288
+ what: miscellaneous
1289
+ who: various online sources
1290
+ when: before 2021
1291
+ language: English
1292
+
1293
+ - name: mmlu_moral_disputes
1294
+ display_name: Moral Disputes
1295
+ short_display_name: Moral Disputes
1296
+ description: The moral disputes subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1297
+ metric_groups:
1298
+ - accuracy
1299
+ - efficiency
1300
+ - general_information
1301
+ environment:
1302
+ main_name: exact_match
1303
+ main_split: test
1304
+ taxonomy:
1305
+ task: multiple-choice question answering
1306
+ what: moral disputes
1307
+ who: various online sources
1308
+ when: before 2021
1309
+ language: English
1310
+
1311
+ - name: mmlu_moral_scenarios
1312
+ display_name: Moral Scenarios
1313
+ short_display_name: Moral Scenarios
1314
+ description: The moral scenarios subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1315
+ metric_groups:
1316
+ - accuracy
1317
+ - efficiency
1318
+ - general_information
1319
+ environment:
1320
+ main_name: exact_match
1321
+ main_split: test
1322
+ taxonomy:
1323
+ task: multiple-choice question answering
1324
+ what: moral scenarios
1325
+ who: various online sources
1326
+ when: before 2021
1327
+ language: English
1328
+
1329
+ - name: mmlu_nutrition
1330
+ display_name: Nutrition
1331
+ short_display_name: Nutrition
1332
+ description: The nutrition subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1333
+ metric_groups:
1334
+ - accuracy
1335
+ - efficiency
1336
+ - general_information
1337
+ environment:
1338
+ main_name: exact_match
1339
+ main_split: test
1340
+ taxonomy:
1341
+ task: multiple-choice question answering
1342
+ what: nutrition
1343
+ who: various online sources
1344
+ when: before 2021
1345
+ language: English
1346
+
1347
+ - name: mmlu_prehistory
1348
+ display_name: Prehistory
1349
+ short_display_name: Prehistory
1350
+ description: The prehistory subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1351
+ metric_groups:
1352
+ - accuracy
1353
+ - efficiency
1354
+ - general_information
1355
+ environment:
1356
+ main_name: exact_match
1357
+ main_split: test
1358
+ taxonomy:
1359
+ task: multiple-choice question answering
1360
+ what: prehistory
1361
+ who: various online sources
1362
+ when: before 2021
1363
+ language: English
1364
+
1365
+ - name: mmlu_professional_accounting
1366
+ display_name: Professional Accounting
1367
+ short_display_name: Professional Accounting
1368
+ description: The professional accounting subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1369
+ metric_groups:
1370
+ - accuracy
1371
+ - efficiency
1372
+ - general_information
1373
+ environment:
1374
+ main_name: exact_match
1375
+ main_split: test
1376
+ taxonomy:
1377
+ task: multiple-choice question answering
1378
+ what: professional accounting
1379
+ who: various online sources
1380
+ when: before 2021
1381
+ language: English
1382
+
1383
+ - name: mmlu_professional_law
1384
+ display_name: Professional Law
1385
+ short_display_name: Professional Law
1386
+ description: The professional law subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1387
+ metric_groups:
1388
+ - accuracy
1389
+ - efficiency
1390
+ - general_information
1391
+ environment:
1392
+ main_name: exact_match
1393
+ main_split: test
1394
+ taxonomy:
1395
+ task: multiple-choice question answering
1396
+ what: professional law
1397
+ who: various online sources
1398
+ when: before 2021
1399
+ language: English
1400
+
1401
+ - name: mmlu_professional_psychology
1402
+ display_name: Professional Psychology
1403
+ short_display_name: Professional Psychology
1404
+ description: The professional psychology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1405
+ metric_groups:
1406
+ - accuracy
1407
+ - efficiency
1408
+ - general_information
1409
+ environment:
1410
+ main_name: exact_match
1411
+ main_split: test
1412
+ taxonomy:
1413
+ task: multiple-choice question answering
1414
+ what: professional psychology
1415
+ who: various online sources
1416
+ when: before 2021
1417
+ language: English
1418
+
1419
+ - name: mmlu_public_relations
1420
+ display_name: Public Relations
1421
+ short_display_name: Public Relations
1422
+ description: The public relations subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1423
+ metric_groups:
1424
+ - accuracy
1425
+ - efficiency
1426
+ - general_information
1427
+ environment:
1428
+ main_name: exact_match
1429
+ main_split: test
1430
+ taxonomy:
1431
+ task: multiple-choice question answering
1432
+ what: public relations
1433
+ who: various online sources
1434
+ when: before 2021
1435
+ language: English
1436
+
1437
+ - name: mmlu_security_studies
1438
+ display_name: Security Studies
1439
+ short_display_name: Security Studies
1440
+ description: The security studies subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1441
+ metric_groups:
1442
+ - accuracy
1443
+ - efficiency
1444
+ - general_information
1445
+ environment:
1446
+ main_name: exact_match
1447
+ main_split: test
1448
+ taxonomy:
1449
+ task: multiple-choice question answering
1450
+ what: security studies
1451
+ who: various online sources
1452
+ when: before 2021
1453
+ language: English
1454
+
1455
+ - name: mmlu_sociology
1456
+ display_name: Sociology
1457
+ short_display_name: Sociology
1458
+ description: The sociology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1459
+ metric_groups:
1460
+ - accuracy
1461
+ - efficiency
1462
+ - general_information
1463
+ environment:
1464
+ main_name: exact_match
1465
+ main_split: test
1466
+ taxonomy:
1467
+ task: multiple-choice question answering
1468
+ what: sociology
1469
+ who: various online sources
1470
+ when: before 2021
1471
+ language: English
1472
+
1473
+ - name: mmlu_virology
1474
+ display_name: Virology
1475
+ short_display_name: Virology
1476
+ description: The virology subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1477
+ metric_groups:
1478
+ - accuracy
1479
+ - efficiency
1480
+ - general_information
1481
+ environment:
1482
+ main_name: exact_match
1483
+ main_split: test
1484
+ taxonomy:
1485
+ task: multiple-choice question answering
1486
+ what: virology
1487
+ who: various online sources
1488
+ when: before 2021
1489
+ language: English
1490
+
1491
+ - name: mmlu_world_religions
1492
+ display_name: World Religions
1493
+ short_display_name: World Religions
1494
+ description: The world religions subject in the Massive Multitask Language Understanding (MMLU) benchmark.
1495
+ metric_groups:
1496
+ - accuracy
1497
+ - efficiency
1498
+ - general_information
1499
+ environment:
1500
+ main_name: exact_match
1501
+ main_split: test
1502
+ taxonomy:
1503
+ task: multiple-choice question answering
1504
+ what: world religions
1505
+ who: various online sources
1506
+ when: before 2021
1507
+ language: English