crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (499)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
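
The dominant structural change in this release is visible in the renames above: helm/proxy/clients and helm/proxy/tokenizers become the top-level packages helm/clients and helm/tokenizers. Downstream code importing from the old paths needs a one-line update per import. A minimal sketch of the migration (OpenAIClient and HuggingFaceTokenizer are class names assumed from the moved modules in entries 373 and 441; the same pattern applies to the other moved modules):

    # crfm-helm 0.4.0: client and tokenizer modules lived under helm.proxy
    # from helm.proxy.clients.openai_client import OpenAIClient
    # from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    # crfm-helm 0.5.1: both packages are now top-level
    from helm.clients.openai_client import OpenAIClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

The hunk below is the diff viewer's rendering of one of the added files; its 823 added lines match entry 232, helm/benchmark/static/schema_vlm.yaml. The rendering is truncated partway through the run_groups section.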
@@ -0,0 +1,823 @@
+---
+############################################################
+adapter:
+  - name: method
+    description: The high-level strategy for converting instances into a prompt for the language model.
+    values:
+      - name: generation
+        description: Given the input, the model generates the output free-form.
+      - name: multiple_choice_joint
+        description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
+      - name: multiple_choice_separate_original
+        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
+      - name: multiple_choice_separate_calibrated
+        description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
+      - name: language_modeling
+        description: Given the input, the model assigns the sequence a probability.
+  - name: instructions
+    description: The description of the task that is included at the very beginning of the prompt.
+  - name: global_prefix
+    description: The string that is prepended to the prompt.
+  - name: global_suffix
+    description: The string that is appended to the prompt.
+  - name: instance_prefix
+    description: The string that is included before each instance (e.g., '\n\n').
+  - name: input_prefix
+    description: The string that is included before each input (e.g., 'Question:').
+  - name: input_suffix
+    description: The string that is included after each input (e.g., '\n').
+  - name: reference_prefix
+    description: The string that is included before each reference (for multiple-choice questions).
+  - name: reference_suffix
+    description: The string that is included after each reference (for multiple-choice questions).
+  - name: output_prefix
+    description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
+  - name: output_suffix
+    description: The string that is included after the correct answer/predicted output (e.g., '\n').
+  - name: substitutions
+    description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
+  - name: max_train_instances
+    description: Maximum number of training instances to include in the prompt (currently by randomly sampling).
+  - name: max_eval_instances
+    description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
+  - name: num_outputs
+    description: Maximum number of possible outputs to generate by sampling multiple outputs.
+  - name: num_train_trials
+    description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
+  - name: sample_train
+    description: If true, randomly sample N training examples; if false, select N consecutive training examples.
+  - name: model
+    description: Name of the language model (<creator_organization>/<model name>) to send requests to.
+  - name: model_deployment
+    description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
+  - name: temperature
+    description: Temperature parameter used in generation.
+  - name: max_tokens
+    description: Maximum number of tokens to generate.
+  - name: stop_sequences
+    description: List of sequences, where we stop generation if we encounter any of them.
+  - name: random
+    description: Random seed (string), which guarantees reproducibility.
+  - name: multi_label
+    description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
+
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Vision Language metrics [text]:
+  - name: edit_similarity
+    display_name: Edit similarity (Levenshtein)
+    short_display_name: Edit sim.
+    lower_is_better: false
+    description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
+
+  # Vision Language metrics [image]:
+  - name: earth_mover_similarity
+    display_name: Earth Mover Similarity
+    short_display_name: EMD-Sim
+    description: 1 - Earth Mover Distance [(Rubner and Tomasi, 2000)](https://www.cs.cmu.edu/~efros/courses/LBMV07/Papers/rubner-jcviu-00.pdf) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: pixel_similarity
+    display_name: Pixel Similarity
+    short_display_name: PS
+    description: Pixel Similarity between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: sift_similarity
+    display_name: SIFT Similarity
+    short_display_name: SIFT
+    description: SIFT Similarity (Scale-Invariant Feature Transform) [(Lowe, 1999)](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=790410) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: compilation_success
+    display_name: Compilation success
+    description: Fraction of instances where the generated code compiles successfully.
+    lower_is_better: false
+  - name: lpips_similarity
+    display_name: LPIPS similarity
+    short_display_name: LPIPS
+    description: LPIPS similarity (Learned Perceptual Image Patch Similarity) [(Zhang et al., 2018)](https://arxiv.org/abs/1801.03924) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: fid_similarity
+    display_name: FID similarity
+    short_display_name: FID
+    description: FID similarity (Fréchet Inception Distance) [(Heusel et al., 2017)](https://arxiv.org/abs/1706.08500) between an image generated by the model and the target image.
+    lower_is_better: false
+  - name: ssim_similarity
+    display_name: SSIM
+    short_display_name: SSIM
+    description: SSIM similarity (Structural Similarity Index) [(Wang et al., 2004)](https://www.cns.nyu.edu/pub/eero/wang03-reprint.pdf) between an image generated by the model and the target image.
+    lower_is_better: false
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
+  - name: bleu_1
+    display_name: BLEU-1
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: bleu_4
+    display_name: BLEU-4
+    description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+    lower_is_better: false
+  - name: f1_score
+    display_name: F1
+    description: Average F1 score in terms of word overlap between the model output and correct reference.
+    lower_is_better: false
+  - name: cider
+    display_name: CIDEr
+    description: Evaluates the quality of generated caption by measuring the weighted similarity of n-grams between the captions and a set of human-written reference captions, emphasizing informativeness and consensus.
+    lower_is_better: false
+
+  # Bias metrics:
+  - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
+    display_name: Stereotypical associations (race, profession)
+    short_display_name: Stereotypes (race)
+    lower_is_better: true
+    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+  - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
+    display_name: Stereotypical associations (race, adjectives)
+    short_display_name: Stereotypes (race)
+    lower_is_better: true
+    description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
+    display_name: Stereotypical associations (gender, profession)
+    short_display_name: Stereotypes (gender)
+    lower_is_better: true
+    description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+  - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
+    display_name: Stereotypical associations (gender, adjectives)
+    short_display_name: Stereotypes (gender)
+    lower_is_better: true
+    description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bias_metric:mode=representation,demographic_category=race
+    display_name: Demographic representation (race)
+    short_display_name: Representation (race)
+    lower_is_better: true
+    description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
+  - name: bias_metric:mode=representation,demographic_category=gender
+    display_name: Demographic representation (gender)
+    short_display_name: Representation (gender)
+    lower_is_better: true
+    description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+  - name: bbq_metric_ambiguous_bias
+    display_name: BBQ (ambiguous)
+    lower_is_better: true
+    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
+  - name: bbq_metric_unambiguous_bias
+    display_name: BBQ (unambiguous)
+    lower_is_better: true
+    description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+
+  # Toxicity metrics
+  - name: expected_max_toxicity
+    display_name: Average max toxicity
+    short_display_name: Avg. max toxicity
+    lower_is_better: true
+    description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: max_toxicity_probability
+    display_name: max toxicity
+    lower_is_better: true
+    description: Fraction of instances where most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+  - name: toxic_frac
+    display_name: Toxic fraction
+    lower_is_better: true
+    description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+############################################################
+perturbations:
+  - name: robustness
+    display_name: Robustness
+    description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
+  - name: fairness
+    display_name: Fairness
+    description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
+  - name: typos
+    display_name: Typos
+    description: >
+      Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
+      performance between perturbed and unperturbed versions.
+  - name: synonym
+    display_name: Synonyms
+    description: >
+      Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
+      worst-case performance between perturbed and unperturbed versions.
+  - name: dialect
+    display_name: SAE -> AAE
+    short_display_name: Dialect
+    description: >
+      Deterministically substitutes SAE words in input with AAE counterparts using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+  - name: race
+    display_name: First names by race (White -> Black)
+    short_display_name: Race
+    description: >
+      Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+  - name: gender
+    display_name: Pronouns by gender (Male -> Female)
+    short_display_name: Gender
+    description: >
+      Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
+      performance between perturbed and unperturbed versions.
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+  - name: toxicity
+    display_name: Toxicity
+    metrics:
+      - name: toxic_frac
+        split: ${main_split}
+
+  - name: generation_image
+    display_name: Generation (image)
+    metrics:
+      - name: pixel_similarity
+        split: ${main_split}
+      - name: compilation_success
+        split: ${main_split}
+      - name: fid_similarity
+        split: ${main_split}
+      - name: earth_mover_similarity
+        split: ${main_split}
+
+  - name: generation_text
+    display_name: Generation (text)
+    metrics:
+      - name: edit_similarity
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core scenarios
+    description: The scenarios where we evaluate all the models.
+    category: All scenarios
+    subgroups:
+      - hateful_memes
+      - heim_human_eval
+      - viz_wiz
+      - vqa
+      - mmmu
+      - image2structure
+      - unicorn
+      - bingo
+      - multipanelvqa
+      - pope
+      - seed_bench
+      - mme
+
+  - name: a_okvqa
+    display_name: A-OKVQA
+    description: A crowdsourced dataset composed of a diverse set of about 25K questions requiring a broad base of commonsense and world knowledge to answer ([paper](https://arxiv.org/abs/2206.01718)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: crossmodal_3600
+    display_name: Crossmodal 3600
+    description: Crossmodal-3600 dataset (XM3600 in short), a geographically-diverse set of 3600 images annotated with human-generated reference captions in 36 languages. ([paper](https://arxiv.org/abs/2205.12522))
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: multilingual captioning
+      what: Real-world images
+      who: Human experts
+      when: "2022"
+      language: 36 languages
+
+  - name: flickr30k
+    display_name: Flickr30k
+    description: An image caption corpus consisting of 158,915 crowd-sourced captions describing 31,783 Flickr images. ([paper](https://shannon.cs.illinois.edu/DenotationGraph/TACLDenotationGraph.pdf))
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: image captioning
+      what: Flickr images
+      who: Human experts
+      when: "2014"
+      language: English
+
+  - name: gqa
+    display_name: GQA
+    description: Questions about real-world visual reasoning and compositional QA
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: valid
+    taxonomy:
+      task: short answer question answering
+      what: Real-world images
+      who: Human experts
+      when: "2019"
+      language: English
+
+  - name: heim_human_eval
+    display_name: HEIM Human Eval Scenario
+    description: Seeing if we can use VLMs to evaluate AI-generated images from HEIM
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: AI-generated images
+      who: Text-to-image models
+      when: "2024"
+      language: English
+
+  - name: image2structure
+    display_name: Image2Structure
+    description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
+    category: All scenarios
+    subgroups:
+      - image2latex
+      - image2webpage
+      - image2musicsheet
+
483
+  - name: hateful_memes
+    display_name: Hateful Memes
+    description: The Hateful Memes benchmark for multimodal hate speech detection [(Kiela et al., 2020)](https://arxiv.org/pdf/2005.04790.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multimodal classification
+      what: images and text
+      who: annotators from Amazon Mechanical Turk
+      when: "2020"
+      language: English
+
+  - name: mm_safety_bench
+    display_name: MM-SafetyBench
+    description: Exposes the vulnerability of open-source VLMs to toxic and biased content ([paper](https://arxiv.org/abs/2311.17600)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - toxicity
+      - general_information
+    environment:
+      main_name: toxic_frac
+      main_split: test
+    taxonomy:
+      task: safety
+      what: safety images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: mscoco_captioning
+    display_name: MSCOCO (captioning)
+    description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: valid
+    taxonomy:
+      task: image captioning
+      what: Real-world images
+      who: Human experts
+      when: "2014"
+      language: English
+
+  - name: mscoco_categorization
+    display_name: MSCOCO (categorization)
+    description: Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: image categorization
+      what: Real-world images
+      who: Human experts
+      when: "2014"
+      language: English
+
+  - name: viz_wiz
+    display_name: VizWiz
+    description: A benchmark for visual question answering with images and questions created by visually impaired people [(Gurari et al., 2018)](https://arxiv.org/abs/1802.08218).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: valid
+    taxonomy:
+      task: multimodal short answer question answering
+      what: Real-world images
+      who: Visually impaired people
+      when: "2018"
+      language: English
+
+  - name: vqa
+    display_name: VQAv2
+    description: Open-ended questions about real-world images [(Goyal et al., 2017)](https://arxiv.org/abs/1612.00837).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: f1_score
+      main_split: valid
+    taxonomy:
+      task: multimodal short answer question answering
+      what: Real-world images
+      who: Human experts
+      when: "2017"
+      language: English
+
+  - name: math_vista
+    display_name: MathVista
+    description: A benchmark for evaluating math reasoning in visual contexts.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Math questions in visual contexts
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: mmmu
+    display_name: MMMU
+    description: A benchmark designed to evaluate multimodal models on massive multi-discipline tasks demanding college-level subject knowledge and deliberate reasoning [(Yue et al., 2023)](https://arxiv.org/abs/2311.16502).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: multimodal multiple-choice question answering
+      what: Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: unicorn
+    display_name: Unicorn
+    description: A safety evaluation benchmark for out-of-distribution and sketch images.
+    metric_groups:
+      - accuracy
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: short answer question answering
+      what: OOD images and sketch images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: bingo
+    display_name: Bingo
+    description: Open-ended questions about biased images.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: short answer question answering
+      what: Biased images about Region, OCR, Factual, Text-to-Image and Image-to-Image inference challenges
+      who: Human experts
+      when: "2023"
+      language: English, Chinese, Japanese, etc.
+
+  - name: multipanelvqa
+    display_name: MultipanelVQA
+    description: Questions about real-world or synthetic multipanel images for evaluating multi-panel image reasoning ability.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: short answer or multiple-choice question answering
+      what: Real-world or synthetic multipanel images
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: pope
+    display_name: POPE
+    description: Open-ended questions about object appearance in real-world images for evaluating hallucination behavior.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: short answer question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: seed_bench
+    display_name: Seed Bench
+    description: >
+      A massive multiple-choice question-answering benchmark that spans 9 evaluation aspects with the image input,
+      including the comprehension of both the image and video modalities.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: mme
+    display_name: MME
+    description: A comprehensive MLLM evaluation benchmark with perception and cognition evaluations on 14 subtasks.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Real-world images
+      who: Human experts
+      when: "2023"
+      language: English
+
+  - name: mementos
+    display_name: Mementos
+    description: A comprehensive benchmark for multimodal large language model reasoning over image sequences.
+    metric_groups:
+      - accuracy
+    environment:
+      main_name: f1_score
+      main_split: test
+    taxonomy:
+      task: short answer question answering
+      what: Image sequences of comics, daily life, and robotics
+      who: Human experts
+      when: "2024"
+      language: English
+
+  - name: image2latex
+    display_name: Image2LaTeX
+    description: The Image2LaTeX benchmark for converting images of mathematical equations, tables, algorithms, and TikZ figures to LaTeX.
+    metric_groups:
+      - accuracy
+      - generation_image
+      - generation_text
+      - efficiency
+      - general_information
+    environment:
+      main_name: earth_mover_similarity
+      main_split: valid
+    taxonomy:
+      task: image-to-text
+      what: mathematical equations, tables, algorithms, tikz
+      who: n/a
+      when: "2024"
+      language: English
+
+  - name: image2webpage
+    display_name: Image2webpage
+    description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/JavaScript.
+    metric_groups:
+      - accuracy
+      - generation_image
+      - generation_text
+      - efficiency
+      - general_information
+    environment:
+      main_name: earth_mover_similarity
+      main_split: valid
+    taxonomy:
+      task: image-to-text
+      what: css, html, javascript
+      who: n/a
+      when: "2024"
+      language: English
+
+  - name: image2musicsheet
+    display_name: Image2musicsheet
+    description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
+    metric_groups:
+      - accuracy
+      - generation_image
+      - efficiency
+      - general_information
+    environment:
+      main_name: earth_mover_similarity
+      main_split: valid
+    taxonomy:
+      task: image-to-text
+      what: music sheets
+      who: n/a
+      when: "2024"
+      language: English
+
+  - name: chart2csv
+    display_name: Chart2CSV
+    description: The Chart2CSV benchmark for converting images of charts to CSV.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: chart to CSV
+      what: plots
+      who: n/a
+      when: "2024"
+      language: English
+
+  - name: pairs
+    display_name: PAIRS
+    description: Examining gender and racial bias in large vision-language models using a novel dataset of parallel images.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: Bias
+      who: Human experts
+      when: "2024"
+      language: English
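
Note that `core_scenarios` and `image2structure` above are container groups: they declare `subgroups` rather than metrics, and `image2structure` is itself nested inside `core_scenarios`. A sketch of flattening that nesting down to leaf run groups, assuming the schema is loaded with PyYAML and using a hypothetical file name:

import yaml

def expand_subgroups(name: str, groups_by_name: dict) -> list:
    # Recursively flatten container groups (those declaring `subgroups`)
    # down to leaf run groups that carry metric_groups of their own.
    group = groups_by_name[name]
    subgroups = group.get("subgroups")
    if not subgroups:
        return [name]
    leaves = []
    for sub in subgroups:
        leaves.extend(expand_subgroups(sub, groups_by_name))
    return leaves

with open("schema_vhelm.yaml") as f:  # hypothetical path to this schema
    schema = yaml.safe_load(f)
groups_by_name = {group["name"]: group for group in schema["run_groups"]}
# image2structure expands to image2latex, image2webpage, image2musicsheet.
print(expand_subgroups("core_scenarios", groups_by_name))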
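
Continuing from the previous sketch, a consistency check along these lines can catch a run group that references an undefined metric group, or one that uses `${main_name}`/`${main_split}` without supplying them in its `environment`. This helper is an assumption for illustration, not part of HELM:

def validate_schema(schema: dict) -> list:
    # Flag run groups that reference undefined metric groups, or that use
    # metric groups without supplying main_name/main_split. (In this schema
    # every group with metric_groups references `accuracy`, which needs both.)
    defined = {group["name"] for group in schema["metric_groups"]}
    errors = []
    for group in schema["run_groups"]:
        refs = group.get("metric_groups", [])
        for ref in refs:
            if ref not in defined:
                errors.append(f"{group['name']}: unknown metric group {ref!r}")
        environment = group.get("environment", {})
        if refs and not {"main_name", "main_split"} <= environment.keys():
            errors.append(f"{group['name']}: environment lacks main_name/main_split")
    return errors

print(validate_schema(schema))  # expect [] for the schema above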