crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/static/schema_lite.yaml
@@ -0,0 +1,824 @@
+ ---
+ ############################################################
+ adapter:
+ - name: method
+   description: The high-level strategy for converting instances into a prompt for the language model.
+   values:
+   - name: generation
+     description: Given the input, the model generates the output free-form.
+   - name: multiple_choice_joint
+     description: Given the input, the model selects from multiple-choice options (A., B., C., D., E.).
+   - name: multiple_choice_separate_original
+     description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability.
+   - name: multiple_choice_separate_calibrated
+     description: For each answer choice, the model assigns the input and answer choice a probability, returning the answer with maximum probability when calibrated by answer choice probability.
+   - name: language_modeling
+     description: Given the input, the model assigns the sequence a probability.
+ - name: instructions
+   description: The description of the task that is included at the very beginning of the prompt.
+ - name: global_prefix
+   description: The string that is prepended to the prompt.
+ - name: global_suffix
+   description: The string that is appended to the prompt.
+ - name: instance_prefix
+   description: The string that is included before each instance (e.g., '\n\n').
+ - name: input_prefix
+   description: The string that is included before each input (e.g., 'Question:').
+ - name: input_suffix
+   description: The string that is included after each input (e.g., '\n').
+ - name: reference_prefix
+   description: The string that is included before each reference (for multiple-choice questions).
+ - name: reference_suffix
+   description: The string that is included after each reference (for multiple-choice questions).
+ - name: output_prefix
+   description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
+ - name: output_suffix
+   description: The string that is included after the correct answer/predicted output (e.g., '\n').
+ - name: substitutions
+   description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt.
+ - name: max_train_instances
+   description: Maximum number of training instances to include in the prompt (currently selected by random sampling).
+ - name: max_eval_instances
+   description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
+ - name: num_outputs
+   description: Maximum number of possible outputs to generate by sampling multiple outputs.
+ - name: num_train_trials
+   description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
+ - name: sample_train
+   description: If true, randomly sample N training examples; if false, select N consecutive training examples.
+ - name: model
+   description: Name of the language model (<creator_organization>/<model name>) to send requests to.
+ - name: model_deployment
+   description: Name of the language model deployment (<host_organization>/<model name>) to send requests to.
+ - name: temperature
+   description: Temperature parameter used in generation.
+ - name: max_tokens
+   description: Maximum number of tokens to generate.
+ - name: stop_sequences
+   description: List of sequences, where we stop generation if we encounter any of them.
+ - name: random
+   description: Random seed (string), which guarantees reproducibility.
+ - name: multi_label
+   description: If true, for instances with multiple correct references, the gold answer should be considered to be all of the correct references rather than any of the correct references.
+
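The adapter fields above are essentially a prompt template. A minimal sketch of how they might compose into a prompt for the `multiple_choice_joint` method (a hypothetical helper, not HELM's actual `Adapter` implementation; the field defaults are illustrative):

```python
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class AdapterSpec:
    # Illustrative defaults mirroring the schema fields above.
    instructions: str = "Answer the following questions.\n"
    global_prefix: str = ""
    global_suffix: str = ""
    instance_prefix: str = "\n\n"
    input_prefix: str = "Question: "
    input_suffix: str = "\n"
    reference_prefix: str = "A. "
    reference_suffix: str = "\n"
    output_prefix: str = "Answer: "
    output_suffix: str = "\n"

def render_instance(spec: AdapterSpec, question: str, choices: List[str], answer: str = "") -> str:
    """One instance: the input, the lettered references, then the answer slot."""
    text = spec.input_prefix + question + spec.input_suffix
    for i, choice in enumerate(choices):
        letter = chr(ord("A") + i)
        # reference_prefix is 'A. ' in the schema's example; bump the letter per choice.
        text += spec.reference_prefix.replace("A", letter, 1) + choice + spec.reference_suffix
    # For the eval instance the answer is left blank for the model to complete.
    return text + spec.output_prefix + (answer + spec.output_suffix if answer else "")

def render_prompt(spec: AdapterSpec, train: List[Tuple[str, List[str], str]],
                  eval_question: str, eval_choices: List[str]) -> str:
    """Instructions, then in-context training instances, then the eval instance."""
    blocks = [render_instance(spec, q, c, a) for q, c, a in train]
    blocks.append(render_instance(spec, eval_question, eval_choices))
    return spec.global_prefix + spec.instructions + spec.instance_prefix.join(blocks) + spec.global_suffix
```
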
+ ############################################################
+ metrics:
+ # Infrastructure metrics:
+ - name: num_perplexity_tokens
+   display_name: '# tokens'
+   description: Average number of tokens in the predicted output (for language modeling, the input too).
+ - name: num_bytes
+   display_name: '# bytes'
+   description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+ - name: num_references
+   display_name: '# ref'
+   description: Number of references.
+ - name: num_train_trials
+   display_name: '# trials'
+   description: Number of trials, where in each trial we choose an independent, random set of training instances.
+ - name: estimated_num_tokens_cost
+   display_name: 'cost'
+   description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+ - name: num_prompt_tokens
+   display_name: '# prompt tokens'
+   description: Number of tokens in the prompt.
+ - name: num_prompt_characters
+   display_name: '# prompt chars'
+   description: Number of characters in the prompt.
+ - name: num_completion_tokens
+   display_name: '# completion tokens'
+   description: Actual number of completion tokens (over all completions).
+ - name: num_output_tokens
+   display_name: '# output tokens'
+   description: Actual number of output tokens.
+ - name: max_num_output_tokens
+   display_name: 'Max output tokens'
+   description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+ - name: num_requests
+   display_name: '# requests'
+   description: Number of distinct API requests.
+ - name: num_instances
+   display_name: '# eval'
+   description: Number of evaluation instances.
+ - name: num_train_instances
+   display_name: '# train'
+   description: Number of training instances (e.g., in-context examples).
+ - name: prompt_truncated
+   display_name: truncated
+   description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+ - name: finish_reason_length
+   display_name: finish b/c length
+   description: Fraction of instances where the output was terminated because of the max tokens limit.
+ - name: finish_reason_stop
+   display_name: finish b/c stop
+   description: Fraction of instances where the output was terminated because of the stop sequences.
+ - name: finish_reason_endoftext
+   display_name: finish b/c endoftext
+   description: Fraction of instances where the output was terminated because the end of text token was generated.
+ - name: finish_reason_unknown
+   display_name: finish b/c unknown
+   description: Fraction of instances where the output was terminated for unknown reasons.
+ - name: num_completions
+   display_name: '# completions'
+   description: Number of completions.
+ - name: predicted_index
+   display_name: Predicted index
+   description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+ # Accuracy metrics:
+ - name: exact_match
+   display_name: Exact match
+   short_display_name: EM
+   description: Fraction of instances that the predicted output matches a correct reference exactly.
+   lower_is_better: false
+ - name: quasi_exact_match
+   display_name: Quasi-exact match
+   short_display_name: EM
+   description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+   lower_is_better: false
+ - name: prefix_exact_match
+   display_name: Prefix exact match
+   short_display_name: PEM
+   description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+   lower_is_better: false
+ - name: quasi_prefix_exact_match
+   # TODO: should call this prefix_quasi_exact_match
+   display_name: Prefix quasi-exact match
+   short_display_name: PEM
+   description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+   lower_is_better: false
+
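The four match variants differ only in the comparison applied to prediction and reference. A sketch of the distinction; the "light processing" shown (lowercasing, dropping punctuation, articles, and extra whitespace) is the usual SQuAD-style normalization recipe and is an assumption about the details, not a transcription of HELM's code:

```python
import re
import string

def normalize(text: str) -> str:
    """Light processing: lowercase, drop punctuation and articles, squeeze whitespace."""
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())

def exact_match(pred: str, ref: str) -> bool:
    return pred == ref

def quasi_exact_match(pred: str, ref: str) -> bool:
    return normalize(pred) == normalize(ref)

def prefix_exact_match(pred: str, ref: str) -> bool:
    # The reference only needs to be a prefix of the prediction.
    return pred.startswith(ref)

def quasi_prefix_exact_match(pred: str, ref: str) -> bool:
    return normalize(pred).startswith(normalize(ref))
```

The @5 variants apply the same comparison to each of the top 5 sampled outputs and count the instance as correct if any of them matches.
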
+ - name: exact_match@5
+   display_name: Exact match @5
+   short_display_name: EM@5
+   description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference exactly.
+   lower_is_better: false
+ - name: quasi_exact_match@5
+   display_name: Quasi-exact match @5
+   short_display_name: EM@5
+   description: Fraction of instances where at least one predicted output among the top 5 matches a correct reference up to light processing.
+   lower_is_better: false
+ - name: prefix_exact_match@5
+   display_name: Prefix exact match @5
+   short_display_name: PEM@5
+   description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference exactly.
+   lower_is_better: false
+ - name: quasi_prefix_exact_match@5
+   display_name: Prefix quasi-exact match @5
+   short_display_name: PEM@5
+   description: Fraction of instances where at least one predicted output among the top 5 matches the prefix of a correct reference up to light processing.
+   lower_is_better: false
+
+ - name: logprob
+   display_name: Log probability
+   short_display_name: Logprob
+   description: Predicted output's average log probability (input's log prob for language modeling).
+   lower_is_better: false
+ - name: logprob_per_byte
+   display_name: Log probability / byte
+   short_display_name: Logprob/byte
+   description: Predicted output's average log probability normalized by the number of bytes.
+   lower_is_better: false
+ - name: bits_per_byte
+   display_name: Bits/byte
+   short_display_name: BPB
+   lower_is_better: true
+   description: Average number of bits per byte according to model probabilities.
+ - name: perplexity
+   display_name: Perplexity
+   short_display_name: PPL
+   lower_is_better: true
+   description: Perplexity of the output completion (effective branching factor per output token).
+ - name: rouge_1
+   display_name: ROUGE-1
+   description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+   lower_is_better: false
+ - name: rouge_2
+   display_name: ROUGE-2
+   description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+   lower_is_better: false
+ - name: rouge_l
+   display_name: ROUGE-L
+   description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+   lower_is_better: false
+ - name: bleu_1
+   display_name: BLEU-1
+   description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 1-gram overlap.
+   lower_is_better: false
+ - name: bleu_4
+   display_name: BLEU-4
+   description: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.
+   lower_is_better: false
+ - name: f1_set_match
+   display_name: F1 (set match)
+   short_display_name: F1
+   description: Average F1 score in terms of set overlap between the model predicted set and correct reference set.
+   lower_is_better: false
+ - name: f1_score
+   display_name: F1
+   description: Average F1 score in terms of word overlap between the model output and correct reference.
+   lower_is_better: false
+ - name: classification_macro_f1
+   display_name: Macro-F1
+   description: Population-level macro-averaged F1 score.
+   lower_is_better: false
+ - name: classification_micro_f1
+   display_name: Micro-F1
+   description: Population-level micro-averaged F1 score.
+   lower_is_better: false
+ - name: absolute_value_difference
+   display_name: Absolute difference
+   short_display_name: Diff.
+   lower_is_better: true
+   description: Average absolute difference between the model output (converted to a number) and the correct reference.
+ - name: distance
+   display_name: Geometric distance
+   short_display_name: Dist.
+   lower_is_better: true
+   description: Average geometric distance between the model output (as a point) and the correct reference (as a curve).
+ - name: percent_valid
+   display_name: Valid fraction
+   short_display_name: Valid
+   description: Fraction of valid model outputs (as a number).
+   lower_is_better: false
+ - name: NDCG@10
+   display_name: NDCG@10
+   description: Normalized discounted cumulative gain at 10 in information retrieval.
+   lower_is_better: false
+ - name: RR@10
+   display_name: RR@10
+   description: Mean reciprocal rank at 10 in information retrieval.
+   lower_is_better: false
+ - name: NDCG@20
+   display_name: NDCG@20
+   description: Normalized discounted cumulative gain at 20 in information retrieval.
+   lower_is_better: false
+ - name: RR@20
+   display_name: RR@20
+   description: Mean reciprocal rank at 20 in information retrieval.
+   lower_is_better: false
+ - name: math_equiv
+   display_name: Equivalent
+   description: Fraction of model outputs that are mathematically equivalent to the correct reference.
+   lower_is_better: false
+ - name: math_equiv_chain_of_thought
+   display_name: Equivalent (CoT)
+   description: Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.
+   lower_is_better: false
+ - name: exact_match_indicator
+   display_name: Exact match (final)
+   short_display_name: EM
+   description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator (e.g., space).
+   lower_is_better: false
+ - name: final_number_exact_match
+   display_name: Exact match (final number)
+   short_display_name: EM
+   description: Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.
+   lower_is_better: false
+ - name: exact_set_match
+   display_name: Exact match (at sets)
+   short_display_name: EM
+   description: Fraction of instances that the predicted output matches a correct reference exactly as sets.
+   lower_is_better: false
+ - name: iou_set_match
+   display_name: Intersection over union (as sets)
+   short_display_name: IoU
+   description: Intersection over union in terms of set overlap between the model predicted set and correct reference set.
+   lower_is_better: false
+
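Two of the entries above, bits_per_byte and perplexity, are direct transforms of the total log probability the model assigns. A worked sketch, assuming natural-log probabilities and UTF-8 byte counts:

```python
import math

def bits_per_byte(sum_logprob: float, num_bytes: int) -> float:
    # Convert nats to bits (divide by ln 2), normalize by byte count, negate.
    return -sum_logprob / (num_bytes * math.log(2))

def perplexity(sum_logprob: float, num_tokens: int) -> float:
    # Effective branching factor per output token.
    return math.exp(-sum_logprob / num_tokens)

# Example: a model assigns total log prob -120.0 nats to a 300-byte, 80-token completion.
print(bits_per_byte(-120.0, 300))  # ~0.577 bits/byte
print(perplexity(-120.0, 80))      # ~4.48
```
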
+ # Summarization metrics
+ - name: summac
+   display_name: SummaC
+   description: Faithfulness scores based on the SummaC method of [Laban et al. (2022)](https://aclanthology.org/2022.tacl-1.10/).
+   lower_is_better: false
+ - name: QAFactEval
+   display_name: QAFactEval
+   description: Faithfulness scores based on the QAFactEval method of [Fabbri et al. (2022)](https://aclanthology.org/2022.naacl-main.187/).
+   lower_is_better: false
+ - name: summarization_coverage
+   display_name: Coverage
+   description: Extent to which the model-generated summaries are extractive fragments from the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+ - name: summarization_density
+   display_name: Density
+   description: Extent to which the model-generated summaries are extractive summaries based on the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+ - name: summarization_compression
+   display_name: Compression
+   description: Extent to which the model-generated summaries are compressed relative to the source document [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/).
+ - name: BERTScore-P
+   display_name: BERTScore (P)
+   description: Average BERTScore precision [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+   lower_is_better: false
+ - name: BERTScore-R
+   display_name: BERTScore (R)
+   description: Average BERTScore recall [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+   lower_is_better: false
+ - name: BERTScore-F
+   display_name: BERTScore (F1)
+   description: Average BERTScore F1 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr) between model generation and reference summary.
+   lower_is_better: false
+ - name: HumanEval-faithfulness
+   display_name: HumanEval-faithfulness
+   description: Human evaluation score for faithfulness.
+   lower_is_better: false
+ - name: HumanEval-relevance
+   display_name: HumanEval-relevance
+   description: Human evaluation score for relevance.
+   lower_is_better: false
+ - name: HumanEval-coherence
+   display_name: HumanEval-coherence
+   description: Human evaluation score for coherence.
+   lower_is_better: false
+
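Coverage, density, and compression all derive from the extractive fragments shared between a summary and its source document. A simplified sketch of the Grusky et al. (2018) definitions (the paper's greedy longest-match fragment procedure is approximated here, and tokenization is naive whitespace splitting):

```python
from typing import List

def extractive_fragments(article: List[str], summary: List[str]) -> List[List[str]]:
    """Greedily consume the longest span of summary tokens that appears verbatim in the article."""
    fragments, i = [], 0
    while i < len(summary):
        best = 0
        for j in range(len(article)):
            k = 0
            while (i + k < len(summary) and j + k < len(article)
                   and summary[i + k] == article[j + k]):
                k += 1
            best = max(best, k)
        if best > 0:
            fragments.append(summary[i:i + best])
            i += best
        else:
            i += 1  # token never appears in the article
    return fragments

def coverage(article: List[str], summary: List[str]) -> float:
    frags = extractive_fragments(article, summary)
    return sum(len(f) for f in frags) / len(summary)

def density(article: List[str], summary: List[str]) -> float:
    frags = extractive_fragments(article, summary)
    return sum(len(f) ** 2 for f in frags) / len(summary)

def compression(article: List[str], summary: List[str]) -> float:
    return len(article) / len(summary)

art = "the cat sat on the mat".split()
summ = "the cat was on the mat".split()
print(coverage(art, summ))  # fragments 'the cat' and 'on the mat' -> 5/6
```
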
+ # Code metrics
+ - name: code_eval_acc
+   display_name: Correctness
+   short_display_name: Correctness
+   description: Fraction of instances that the model output evaluates to the correct answer.
+   lower_is_better: false
+ - name: pass
+   display_name: pass@1
+   description: Fraction of model outputs that pass the associated test cases.
+   lower_is_better: false
+ - name: test_avg
+   display_name: 'Avg. # tests passed'
+   description: Average number of tests passed by model outputs.
+   lower_is_better: false
+ - name: strict_acc
+   display_name: Strict correctness
+   short_display_name: Strict correctness
+   description: Fraction of model outputs that pass all associated test cases.
+   lower_is_better: false
+
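The code metrics share one evaluation loop: execute the model's program against the scenario's test cases, then aggregate. A hedged sketch, where `run_test` stands in for whatever sandboxed execution the scenario uses (a hypothetical callable, not a HELM API):

```python
from typing import Callable, List

def evaluate_code_output(program: str, tests: List[str],
                         run_test: Callable[[str, str], bool]) -> dict:
    """Per-output stats: test_avg is averaged across outputs; strict_acc needs every test to pass."""
    results = [run_test(program, test) for test in tests]
    return {
        "test_avg": float(sum(results)),    # number of tests this output passed
        "strict_acc": float(all(results)),  # 1.0 only if all tests pass
    }
```

pass@1 then falls out as the fraction of outputs whose strict score is 1.0 across instances.
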
+ # Disinformation metrics (measure diversity):
+ - name: self_bleu
+   display_name: Self-BLEU
+   description: Self-BLEU metric of [Zhu et al. (2018)](https://arxiv.org/pdf/1802.01886.pdf) as a measure of diversity in model generations.
+ - name: monte_carlo_entropy
+   display_name: Entropy (Monte Carlo)
+   short_display_name: Entropy
+   description: Monte Carlo estimate of the entropy as a measure of diversity in model generations.
+ # Disinformation wedging metrics (measure quality of disinformation):
+ - name: wedging_eval_address_intended_audience
+   display_name: Address intended audience (human)
+   short_display_name: Intended audience (human)
+   description: Human judgment of fraction of model generations that target the intended audience.
+ - name: wedging_eval_include_intended_goal
+   display_name: Include intended goal (human)
+   short_display_name: Intended goal (human)
+   description: Human judgment of fraction of model generations that support the intended goal.
+ - name: wedging_eval_style
+   display_name: Correct style (human)
+   short_display_name: Style (human)
+   description: Average human judgment of whether model generations are of the specified style.
+ - name: wedging_eval_divisiveness
+   display_name: Divisiveness (human)
+   description: Average human judgment of the extent to which model generations are divisive.
+ - name: wedging_eval_toxicity
+   display_name: Toxicity (human)
+   description: Average human judgment of the extent to which model generations are toxic.
+ # Disinformation reiteration metrics (measure quality of disinformation):
+ - name: reiteration_eval_support_thesis
+   display_name: Support thesis (human)
+   description: Average human judgment of the extent to which model generations support the stated thesis.
+ - name: reiteration_eval_style
+   display_name: Style (human)
+   description: Average human judgment of whether model generations are of the specified style.
+
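The Monte Carlo entropy estimate follows from H(p) = -E[log p(x)]: sample generations from the model itself and average the negative log probability it reports for them. A minimal sketch:

```python
from typing import List

def monte_carlo_entropy(sample_logprobs: List[float]) -> float:
    """H(p) ~ -mean(log p(x)) over samples x drawn from the model."""
    return -sum(sample_logprobs) / len(sample_logprobs)

# Example: total log probs (nats) of five sampled generations.
print(monte_carlo_entropy([-35.2, -41.0, -38.7, -36.9, -40.1]))  # ~38.4 nats
```
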
388
+ # Copyright metrics (measure copying/overlap):
389
+ - name: longest_common_prefix_length
390
+ display_name: Longest common prefix length
391
+ short_display_name: LCS
392
+ lower_is_better: true
393
+ description: Average length of longest common prefix between model generation and reference.
394
+ - name: edit_distance
395
+ display_name: Edit distance (Levenshtein)
396
+ short_display_name: Edit dist.
397
+ description: Average Levenshtein edit distance between model generation and reference.
398
+ lower_is_better: false
399
+ - name: edit_similarity
400
+ display_name: Edit similarity (Levenshtein)
401
+ short_display_name: Edit sim.
402
+ lower_is_better: true
403
+ description: Average Levenshtein edit similarity (1 - distance normalized by length of longer sequence) between model generation and reference.
404
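Both copyright metrics are plain string computations. A self-contained sketch of longest common prefix length and normalized Levenshtein edit similarity follows; the exact implementation in HELM may differ.

```python
# Longest common prefix length and Levenshtein edit similarity, as described
# above. A standard dynamic-programming edit distance is used for brevity.
def longest_common_prefix_length(a: str, b: str) -> int:
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n


def edit_distance(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        cur = [i]
        for j, y in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (x != y)))
        prev = cur
    return prev[-1]


def edit_similarity(a: str, b: str) -> float:
    longer = max(len(a), len(b)) or 1  # avoid division by zero on empty strings
    return 1.0 - edit_distance(a, b) / longer


print(longest_common_prefix_length("It was the best of times", "It was the worst"))  # 11
print(edit_similarity("kitten", "sitting"))  # 1 - 3/7 ≈ 0.571
```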
+
+ # Bias metrics:
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=profession
+ display_name: Stereotypical associations (race, profession)
+ short_display_name: Stereotypes (race)
+ lower_is_better: true
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target professions. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+ - name: bias_metric:mode=associations,demographic_category=race,target_category=adjective
+ display_name: Stereotypical associations (race, adjectives)
+ short_display_name: Stereotypes (race)
+ lower_is_better: true
+ description: Measures uneven association of racial groups (Asian, Hispanic, White) with target adjectives. This measurement is based on co-occurrence statistics between the racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=profession
+ display_name: Stereotypical associations (gender, profession)
+ short_display_name: Stereotypes (gender)
+ lower_is_better: true
+ description: Measures uneven association of gender groups (male, female) with target professions. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target professions (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+ - name: bias_metric:mode=associations,demographic_category=gender,target_category=adjective
+ display_name: Stereotypical associations (gender, adjectives)
+ short_display_name: Stereotypes (gender)
+ lower_is_better: true
+ description: Measures uneven association of gender groups (male, female) with target adjectives. This measurement is based on co-occurrence statistics between the gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)) and the target adjectives (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+ - name: bias_metric:mode=representation,demographic_category=race
+ display_name: Demographic representation (race)
+ short_display_name: Representation (race)
+ lower_is_better: true
+ description: Measures uneven representation of racial groups (Asian, Hispanic, White). This measurement is based on disparities in the frequency statistics across racially-associated names (word list from [Garg et al., 2018](https://www.pnas.org/doi/10.1073/pnas.1720347115); race associations based on US Census statistics).
+ - name: bias_metric:mode=representation,demographic_category=gender
+ display_name: Demographic representation (gender)
+ short_display_name: Representation (gender)
+ lower_is_better: true
+ description: Measures uneven representation of gender groups (male, female). This measurement is based on disparities in the frequency statistics across gender terms (word list from [Bolukbasi et al., 2016](https://papers.nips.cc/paper/2016/hash/a486cd07e4ac3d270571622f4f316ec5-Abstract.html)).
+
+ - name: bbq_metric_ambiguous_bias
+ display_name: BBQ (ambiguous)
+ lower_is_better: true
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on ambiguous examples.
+ - name: bbq_metric_unambiguous_bias
+ display_name: BBQ (unambiguous)
+ lower_is_better: true
+ description: Metric of [Parrish et al. (2022)](https://aclanthology.org/2022.findings-acl.165/) for BBQ on unambiguous examples.
+
+ # Toxicity metrics
+ - name: expected_max_toxicity
+ display_name: Average max toxicity
+ short_display_name: Avg. max toxicity
+ lower_is_better: true
+ description: Average max toxicity score across all completions (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: max_toxicity_probability
+ display_name: Max toxicity
+ lower_is_better: true
+ description: Fraction of instances where the most toxic completion is above 0.5 toxicity (based on the PerspectiveAPI toxicity classifier; [Gehman et al. (2020)](https://aclanthology.org/2020.findings-emnlp.301/)).
+ - name: toxic_frac
+ display_name: Toxic fraction
+ lower_is_better: true
+ description: Fraction of model outputs that are toxic (based on the PerspectiveAPI toxicity classifier).
+
+ # Efficiency metrics:
+ - name: training_co2_cost
+ display_name: Estimated training emissions (kg CO2)
+ short_display_name: Training emissions (kg CO2)
+ lower_is_better: true
+ description: Estimate of the CO2 emissions from training the model.
+ - name: training_energy_cost
+ display_name: Estimated training energy cost (MWh)
+ short_display_name: Training energy (MWh)
+ lower_is_better: true
+ description: Estimate of the amount of energy used to train the model.
+ - name: inference_runtime
+ display_name: Observed inference runtime (s)
+ short_display_name: Observed inference time (s)
+ lower_is_better: true
+ description: Average observed time to process a request to the model (via an API, and thus dependent on the particular deployment).
+ - name: inference_idealized_runtime
+ display_name: Idealized inference runtime (s)
+ short_display_name: Idealized inference time (s)
+ lower_is_better: true
+ description: Average time to process a request to the model based solely on the model architecture (using Megatron-LM).
+ - name: inference_denoised_runtime
+ display_name: Denoised inference runtime (s)
+ short_display_name: Denoised inference time (s)
+ lower_is_better: true
+ description: Average time to process a request to the model with performance contention removed, estimated using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
+ - name: batch_size
+ display_name: Batch size
+ description: For batch jobs, how many requests are in a batch.
+
+ # Calibration metrics:
+ - name: ece_1_bin
+ display_name: 1-bin expected calibration error
+ short_display_name: ECE (1-bin)
+ lower_is_better: true
+ description: The absolute difference between the model's average confidence and its accuracy (only computed for classification tasks).
+ - name: max_prob
+ display_name: Max prob
+ description: Model's average confidence in its prediction (only computed for classification tasks).
+ lower_is_better: false
+ - name: ece_10_bin
+ display_name: 10-bin expected calibration error
+ short_display_name: ECE (10-bin)
+ lower_is_better: true
+ description: The average difference between the model's confidence and accuracy, averaged across 10 bins where each bin contains an equal number of points (only computed for classification tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because each bin will have very few examples.
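The 10-bin ECE described above uses equal-mass bins: sort predictions by confidence, split them into bins of (roughly) equal size, and average |confidence - accuracy| weighted by bin size. A minimal sketch of that computation follows; with `num_bins=1` it reduces to the 1-bin ECE. This is a simplified reading, not HELM's exact code.

```python
# Equal-mass expected calibration error, per the description above.
import numpy as np


def ece_equal_mass(confidences, correct, num_bins: int = 10) -> float:
    order = np.argsort(confidences)
    conf = np.asarray(confidences, dtype=float)[order]
    corr = np.asarray(correct, dtype=float)[order]
    bins = np.array_split(np.arange(len(conf)), num_bins)  # equal-count bins
    n = len(conf)
    return float(sum(
        len(idx) / n * abs(conf[idx].mean() - corr[idx].mean())
        for idx in bins if len(idx) > 0
    ))


rng = np.random.default_rng(0)
p = rng.uniform(0.5, 1.0, size=1000)
y = rng.uniform(size=1000) < p  # perfectly calibrated by construction
print(ece_equal_mass(p, y, num_bins=10))  # should be close to zero
```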
+ - name: platt_ece_1_bin
+ display_name: 1-bin expected calibration error (after Platt scaling)
+ short_display_name: Platt-scaled ECE (1-bin)
+ lower_is_better: true
+ description: 1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+ - name: platt_ece_10_bin
+ display_name: 10-bin expected calibration error (after Platt scaling)
+ short_display_name: Platt-scaled ECE (10-bin)
+ lower_is_better: true
+ description: 10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted probabilities.
+ - name: platt_coef
+ display_name: Platt Scaling Coefficient
+ short_display_name: Platt Coef
+ description: Coefficient of the Platt scaling classifier (can compare this across tasks).
+ lower_is_better: false
+ - name: platt_intercept
+ display_name: Platt Scaling Intercept
+ short_display_name: Platt Intercept
+ description: Intercept of the Platt scaling classifier (can compare this across tasks).
+ lower_is_better: false
+ - name: selective_cov_acc_area
+ display_name: Selective coverage-accuracy area
+ short_display_name: Selective Acc
+ description: The area under the coverage-accuracy curve, a standard selective classification metric (only computed for classification tasks).
+ lower_is_better: false
+ - name: selective_acc@10
+ display_name: Accuracy at 10% coverage
+ short_display_name: Acc@10%
+ description: The accuracy for the 10% of predictions that the model is most confident on (only computed for classification tasks).
+ lower_is_better: false
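Selective accuracy at 10% coverage keeps only the predictions the model is most confident about and measures accuracy on that subset. A short sketch (names are illustrative):

```python
# Accuracy restricted to the most confident fraction of predictions.
import numpy as np


def selective_accuracy(confidences, correct, coverage: float = 0.10) -> float:
    conf = np.asarray(confidences, dtype=float)
    corr = np.asarray(correct, dtype=float)
    k = max(1, int(round(coverage * len(conf))))
    top = np.argsort(-conf)[:k]  # indices of the k most confident predictions
    return float(corr[top].mean())


print(selective_accuracy([0.9, 0.8, 0.2, 0.6], [1, 1, 0, 0], coverage=0.5))  # 1.0
```

Sweeping `coverage` from 0 to 1 and integrating the resulting accuracies yields the coverage-accuracy area described above.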
+
+ # CLEVA (Chinese) metrics:
+ # Accuracy metrics (Chinese)
+ - name: chinese_ibleu
+ display_name: Chinese iBLEU
+ short_display_name: iBLEU (Chinese)
+ description: A special BLEU score [(Sun and Zhou, 2012)](https://aclanthology.org/P12-2008.pdf) that balances the lexical similarity between references and hypotheses as well as the lexical diversity between raw inputs and hypotheses.
+ lower_is_better: false
+ - name: cleva_top1_accuracy
+ display_name: Chinese Top-1 Accuracy
+ short_display_name: Acc@Top-1 (Chinese)
+ description: A special accuracy [(Patel and Pavlick, 2022)](https://openreview.net/pdf?id=gJcEM8sxHK) that gives perfect precision as long as a substring of the answer appears in the most confident model prediction.
+ lower_is_better: false
+ - name: cleva_machine_translation_bleu
+ display_name: BLEU
+ short_display_name: BLEU
+ description: BLEU score based on [Post (2018)](https://aclanthology.org/W18-6319/).
+ lower_is_better: false
+ - name: chinese_rouge_2
+ display_name: Chinese ROUGE-2 score
+ short_display_name: ROUGE-2 (Chinese)
+ description: ROUGE-2 score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on a Chinese tokenizer that segments Chinese strings by character.
+ lower_is_better: false
+ - name: chinese_bleu_1
+ display_name: Chinese BLEU-1 score
+ short_display_name: BLEU-1 (Chinese)
+ description: BLEU-1 score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on a Chinese tokenizer that segments Chinese strings by character.
+ lower_is_better: false
+ - name: cleva_math_result_match
+ display_name: CLEVA Math Exact Match
+ short_display_name: EM (Math)
+ description: Exact match that considers only the last math expression (numbers and fractions) in the model's prediction.
+ lower_is_better: false
+ # CLEVA Chinese bias, copyright and toxicity metrics share the same names as the original HELM metrics
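Several of the CLEVA metrics above tokenize Chinese by character. As an illustration, here is a from-scratch character-level BLEU-1 (clipped unigram precision with a brevity penalty); CLEVA's actual implementation may differ.

```python
# Character-level BLEU-1: segment strings by character, compute clipped
# unigram precision, and apply the standard brevity penalty.
import math
from collections import Counter


def chinese_bleu_1(reference: str, hypothesis: str) -> float:
    ref, hyp = list(reference), list(hypothesis)  # character-level "tokens"
    if not hyp:
        return 0.0
    overlap = sum((Counter(hyp) & Counter(ref)).values())  # clipped matches
    precision = overlap / len(hyp)
    bp = 1.0 if len(hyp) >= len(ref) else math.exp(1 - len(ref) / len(hyp))
    return bp * precision


print(chinese_bleu_1("今天天气很好", "今天天气不错"))  # 4/6 matches, no brevity penalty
```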
+
+ ############################################################
+ perturbations:
+ - name: robustness
+ display_name: Robustness
+ description: Computes worst case over different robustness perturbations (misspellings, formatting, contrast sets).
+ - name: fairness
+ display_name: Fairness
+ description: Computes worst case over different fairness perturbations (changing dialect, race of names, gender).
+ - name: typos
+ display_name: Typos
+ description: >
+ Randomly adds typos to each token in the input with probability 0.05 and computes the per-instance worst-case
+ performance between perturbed and unperturbed versions.
+ - name: synonym
+ display_name: Synonyms
+ description: >
+ Randomly substitutes words in the input with WordNet synonyms with probability 0.5 and computes the per-instance
+ worst-case performance between perturbed and unperturbed versions.
+ - name: dialect
+ display_name: SAE -> AAE
+ short_display_name: Dialect
+ description: >
+ Deterministically substitutes SAE words in the input with AAE counterparts using the validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+ - name: race
+ display_name: First names by race (White -> Black)
+ short_display_name: Race
+ description: >
+ Deterministically substitutes White first names with Black first names sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230) and computes the per-instance worst-case performance between perturbed and unperturbed versions.
+ - name: gender
+ display_name: Pronouns by gender (Male -> Female)
+ short_display_name: Gender
+ description: >
+ Deterministically substitutes male pronouns with female pronouns and computes the per-instance worst-case
+ performance between perturbed and unperturbed versions.
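The typo and synonym perturbations corrupt each token independently with a fixed probability, and scores are aggregated as the per-instance worst case over the original and perturbed inputs. A hedged sketch of the typo case; swapping two adjacent characters is one plausible corruption, while HELM's actual perturbation logic is more elaborate.

```python
# Token-level typo perturbation (probability 0.05 per token) plus the
# per-instance worst-case aggregation described above. Illustrative only.
import random


def add_typos(text: str, prob: float = 0.05, seed: int = 0) -> str:
    rng = random.Random(seed)
    tokens = []
    for tok in text.split():
        if len(tok) > 1 and rng.random() < prob:
            i = rng.randrange(len(tok) - 1)
            tok = tok[:i] + tok[i + 1] + tok[i] + tok[i + 2:]  # swap adjacent chars
        tokens.append(tok)
    return " ".join(tokens)


def worst_case(scores_by_version: list[float]) -> float:
    # Per-instance worst case over unperturbed and perturbed versions;
    # these minima are then averaged over instances.
    return min(scores_by_version)


print(add_typos("the quick brown fox jumps over the lazy dog", prob=0.5))
```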
+
+ ############################################################
+ metric_groups:
+ - name: accuracy
+ display_name: Accuracy
+ metrics:
+ - name: ${main_name}
+ split: ${main_split}
+
+ - name: efficiency
+ display_name: Efficiency
+ metrics:
+ - name: inference_runtime
+ split: ${main_split}
+
+ - name: general_information
+ display_name: General information
+ metrics:
+ - name: num_instances
+ split: ${main_split}
+ - name: num_train_instances
+ split: ${main_split}
+ - name: prompt_truncated
+ split: ${main_split}
+ - name: num_prompt_tokens
+ split: ${main_split}
+ - name: num_output_tokens
+ split: ${main_split}
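The `${main_name}` and `${main_split}` placeholders in `metric_groups` are resolved against each run group's `environment` block below (for example, `narrative_qa` sets `main_name: f1_score` and `main_split: test`). A minimal sketch of that substitution mechanism, purely illustrative rather than HELM's own resolution code:

```python
# Resolve metric-group placeholders from a run group's environment block.
from string import Template

metric_group = [{"name": "${main_name}", "split": "${main_split}"}]
environment = {"main_name": "f1_score", "main_split": "test"}  # e.g., narrative_qa

resolved = [
    {key: Template(value).substitute(environment) for key, value in entry.items()}
    for entry in metric_group
]
print(resolved)  # [{'name': 'f1_score', 'split': 'test'}]
```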
+
+ ############################################################
+ run_groups:
+ - name: core_scenarios
+ display_name: Core scenarios
+ description: The scenarios where we evaluate all the models.
+ category: All scenarios
+ subgroups:
+ - narrative_qa
+ - natural_qa_openbook_longans
+ - natural_qa_closedbook
+ - openbookqa
+ - mmlu
+ - math_chain_of_thought
+ - gsm
+ - legalbench
+ - med_qa
+ - wmt_14
+
+ - name: narrative_qa
+ display_name: NarrativeQA
+ description: The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský et al., 2017)](https://aclanthology.org/Q18-1023/).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: f1_score
+ main_split: test
+ taxonomy:
+ task: short-answer question answering
+ what: passages are books and movie scripts, questions are unknown
+ who: annotators from summaries
+ when: "2018"
+ language: English
+
+ - name: natural_qa_closedbook
+ display_name: NaturalQuestions (closed-book)
+ description: The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: f1_score
+ main_split: valid
+ taxonomy:
+ task: short-answer question answering
+ what: passages from Wikipedia, questions from search queries
+ who: web users
+ when: 2010s
+ language: English
+
+ - name: natural_qa_openbook_longans
+ display_name: NaturalQuestions (open-book)
+ description: The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: f1_score
+ main_split: valid
+ taxonomy:
+ task: short-answer question answering
+ what: passages from Wikipedia, questions from search queries
+ who: web users
+ when: 2010s
+ language: English
+
+ - name: openbookqa
+ display_name: OpenbookQA
+ description: The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: elementary science
+ who: Amazon Mechanical Turk workers
+ when: "2018"
+ language: English
+
+ - name: mmlu
+ display_name: MMLU (Massive Multitask Language Understanding)
+ short_display_name: MMLU
+ description: The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: math, science, history, etc.
+ who: various online sources
+ when: before 2021
+ language: English
+
+ - name: gsm
+ display_name: GSM8K (Grade School Math)
+ short_display_name: GSM8K
+ description: The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: final_number_exact_match
+ main_split: test
+ taxonomy:
+ task: numeric answer question answering
+ what: grade school math word problems
+ who: contractors on Upwork and Surge AI
+ when: "2021"
+ language: English
+
+ - name: math_chain_of_thought
+ display_name: MATH
+ description: The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: math_equiv_chain_of_thought
+ main_split: test
+ taxonomy:
+ task: numeric answer question answering
+ what: math competitions (AMC, AIME, etc.)
+ who: problem setters
+ when: before 2021
+ language: synthetic
+
+ - name: legalbench
+ display_name: LegalBench
+ description: LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al., 2023)](https://arxiv.org/pdf/2308.11462.pdf).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: public legal and administrative documents, manually constructed questions
+ who: lawyers
+ when: before 2023
+ language: English
+
+ - name: med_qa
+ display_name: MedQA
+ description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al., 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: quasi_exact_match
+ main_split: test
+ taxonomy:
+ task: multiple-choice question answering
+ what: US medical licensing exams
+ who: problem setters
+ when: before 2020
+ language: English
+
+ - name: wmt_14
+ display_name: WMT 2014
+ description: WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: bleu_4
+ main_split: test
+ taxonomy:
+ task: machine translation
+ what: multilingual sentences
+ who: Europarl, news, Common Crawl, etc.
+ when: before 2014
+ language: English, French, Czech, etc.