crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1389 @@
1
+ ---
2
+ ############################################################
3
+ metrics:
4
+
5
+ - name: num_perplexity_tokens
6
+ display_name: "# tokens"
7
+ description: Average number of tokens in the predicted output (for language modeling,
8
+ the input too).
9
+
10
+ - name: num_bytes
11
+ display_name: "# bytes"
12
+ description: Average number of bytes in the predicted output (for language modeling,
13
+ the input too).
14
+
15
+ - name: num_references
16
+ display_name: "# ref"
17
+ description: Number of references.
18
+
19
+ - name: num_train_trials
20
+ display_name: "# trials"
21
+ description: Number of trials, where in each trial we choose an independent, random
22
+ set of training instances.
23
+
24
+ - name: estimated_num_tokens_cost
25
+ display_name: cost
26
+ description: An estimate of the number of tokens (including prompt and output completions)
27
+ needed to perform the request.
28
+
29
+ - name: num_prompt_tokens
30
+ display_name: "# prompt tokens"
31
+ description: Number of tokens in the prompt.
32
+
33
+ - name: num_prompt_characters
34
+ display_name: "# prompt chars"
35
+ description: Number of characters in the prompt.
36
+
37
+ - name: num_completion_tokens
38
+ display_name: "# completion tokens"
39
+ description: Actual number of completion tokens (over all completions).
40
+
41
+ - name: num_output_tokens
42
+ display_name: "# output tokens"
43
+ description: Actual number of output tokens.
44
+
45
+ - name: max_num_output_tokens
46
+ display_name: Max output tokens
47
+ description: Maximum number of output tokens (overestimate since we might stop earlier
48
+ due to stop sequences).
49
+
50
+ - name: num_requests
51
+ display_name: "# requests"
52
+ description: Number of distinct API requests.
53
+
54
+ - name: num_instances
55
+ display_name: "# eval"
56
+ description: Number of evaluation instances.
57
+
58
+ - name: num_train_instances
59
+ display_name: "# train"
60
+ description: Number of training instances (e.g., in-context examples).
61
+
62
+ - name: prompt_truncated
63
+ display_name: truncated
64
+ description: Fraction of instances where the prompt itself was truncated (implies
65
+ that there were no in-context examples).
66
+
67
+ - name: finish_reason_length
68
+ display_name: finish b/c length
69
+ description: Fraction of instances where the output was terminated because of
70
+ the max tokens limit.
71
+
72
+ - name: finish_reason_stop
73
+ display_name: finish b/c stop
74
+ description: Fraction of instances where the output was terminated because of
75
+ the stop sequences.
76
+
77
+ - name: finish_reason_endoftext
78
+ display_name: finish b/c endoftext
79
+ description: Fraction of instances where the output was terminated because the
80
+ end of text token was generated.
81
+
82
+ - name: finish_reason_unknown
83
+ display_name: finish b/c unknown
84
+ description: Fraction of instances where the output was terminated for unknown
85
+ reasons.
86
+
87
+ - name: num_completions
88
+ display_name: "# completions"
89
+ description: Number of completions.
90
+
91
+ - name: predicted_index
92
+ display_name: Predicted index
93
+ description: Integer index of the reference (0, 1, ...) that was predicted by the
94
+ model (for multiple-choice).
95
+
96
+ - name: training_co2_cost
97
+ display_name: Estimated training emissions (kg CO2)
98
+ short_display_name: Training emissions (kg CO2)
99
+ lower_is_better: true
100
+ description: Estimate of the CO2 emissions from training the model.
101
+
102
+ - name: training_energy_cost
103
+ display_name: Estimated training energy cost (MWh)
104
+ short_display_name: Training energy (MWh)
105
+ lower_is_better: true
106
+ description: Estimate of the amount of energy used to train the model.
107
+
108
+ - name: inference_runtime
109
+ display_name: Observed inference runtime (s)
110
+ short_display_name: Observed inference time (s)
111
+ lower_is_better: true
112
+ description: Average observed time to process a request to the model (via an API,
113
+ and thus depends on particular deployment).
114
+
115
+ - name: inference_idealized_runtime
116
+ display_name: Idealized inference runtime (s)
117
+ short_display_name: Idealized inference time (s)
118
+ lower_is_better: true
119
+ description: Average time to process a request to the model based solely on the
120
+ model architecture (using Megatron-LM).
121
+
122
+ - name: inference_denoised_runtime
123
+ display_name: Denoised inference runtime (s)
124
+ short_display_name: Denoised inference time (s)
125
+ lower_is_better: true
126
+ description: Average time to process a request to the model minus performance contention
127
+ by using profiled runtimes from multiple trials of SyntheticEfficiencyScenario.
128
+
129
+ - name: batch_size
130
+ display_name: Batch size
131
+ description: For batch jobs, how many requests are in a batch.
132
+
133
+ - name: fid
134
+ display_name: FID
135
+ short_display_name: FID
136
+ description: Fréchet Inception Distance (FID) is a metric used for evaluating the
137
+ quality of images generated by models. The FID compares the distribution of generated
138
+ images with the distribution of real images in feature space.
139
+ lower_is_better: true
140
+
141
+ - name: inception_score
142
+ display_name: Inception Score (IS)
143
+ short_display_name: Inception Score
144
+ description: Inception score (IS) is a metric used for evaluating the quality of
145
+ image-based generative models. The score is calculated based on the output of
146
+ a separate, pretrained Inception v3 image classification model applied to a sample
147
+ of generated images.
148
+ lower_is_better: false
149
+
150
+ - name: kernel_inception_distance
151
+ display_name: Kernel Inception Distance (KID)
152
+ short_display_name: Kernel Inception Distance
153
+ description: Kernel Inception Distance (KID) is a metric used to assess the quality
154
+ of image-based generative models. It was proposed to replace FID.
155
+ lower_is_better: true
156
+
157
+ - name: expected_lpips_score
158
+ display_name: Expected Learned Perceptual Image Patch Similarity (LPIPS) score
159
+ short_display_name: Expected LPIPS score
160
+ description: The Learned Perceptual Image Patch Similarity (LPIPS) is used to judge
161
+ the perceptual similarity between two images. LPIPS computes the similarity between
162
+ the activations of two image patches for some pre-defined network.
163
+ lower_is_better: true
164
+
165
+ - name: expected_multi_scale_ssim_score
166
+ display_name: Expected Multi-scale Structural Similarity Index Measure (SSIM)
167
+ short_display_name: Expected Multi-Scale SSIM
168
+ description: The Multi-scale Structural Similarity Index Measure (MS-SSIM) is a measure
169
+ of image quality and a generalization of Structural Similarity Index Measure (SSIM)
170
+ by incorporating image details at different resolution scores.
171
+ lower_is_better: false
172
+
173
+ - name: expected_psnr_score
174
+ display_name: Expected Peak Signal-to-Noise Ratio (PSNR)
175
+ short_display_name: Expected PSNR
176
+ description: Peak signal-to-noise ratio (PSNR) is the ratio between the maximum
177
+ possible power of a signal and the power of corrupting noise that affects the
178
+ fidelity of its representation.
179
+ lower_is_better: false
180
+
181
+ - name: expected_uiqi_score
182
+ display_name: Expected Universal Image Quality Index (UIQI)
183
+ short_display_name: Expected UIQI
184
+ description: The Universal Image Quality Index (UIQI) is a full-reference image
185
+ quality assessment method that measures the similarity between two images by comparing
186
+ their luminance, contrast, and structure.
187
+ lower_is_better: false
188
+
189
+ - name: gender_imbalance
190
+ display_name: Gender imbalance
191
+ short_display_name: Gender imbalance
192
+ description: The gender imbalance metric assesses the presence of binary gender
193
+ bias within a set of images.
194
+ lower_is_better: true
195
+
196
+ - name: skin_tone_imbalance
197
+ display_name: Skin tone imbalance
198
+ short_display_name: Skin tone imbalance
199
+ description: This skin tone metric assesses the presence of skin tone bias within
200
+ a set of images.
201
+ lower_is_better: true
202
+
203
+ - name: expected_clip_score
204
+ display_name: Expected CLIP score
205
+ short_display_name: Expected CLIP score
206
+ description: CLIPscore measures how well an image is aligned with a corresponding
207
+ natural language description, using the pre-trained CLIP model.
208
+ lower_is_better: false
209
+
210
+ - name: max_clip_score
211
+ display_name: Maximum CLIP score
212
+ short_display_name: Max CLIP score
213
+ description: CLIPscore measures how well an image is aligned with a corresponding
214
+ natural language description, using the pre-trained CLIP model.
215
+ lower_is_better: false
216
+
217
+ - name: detection_correct_frac
218
+ display_name: Detection correct fraction
219
+ short_display_name: Detection correct frac
220
+ description: Fraction of correct images according to the ViTDet object detector
221
+ with ViT-B backbone and detectron2 library to automatically detect objects specified
222
+ in the prompts.
223
+ lower_is_better: false
224
+
225
+ - name: expected_aesthetics_score
226
+ display_name: Expected aesthetics score
227
+ short_display_name: Expected aesthetics score
228
+ description: Following LAION-Aesthetics for determining the aesthetic score of a
229
+ generated image.
230
+ lower_is_better: false
231
+
232
+ - name: max_aesthetics_score
233
+ display_name: Maximum aesthetics score
234
+ short_display_name: Max aesthetics score
235
+ description: Following LAION-Aesthetics for determining the aesthetic score of a
236
+ generated image.
237
+ lower_is_better: false
238
+
239
+ - name: aesthetics_human
240
+ display_name: Aesthetics (human)
241
+ short_display_name: Aesthetics (human)
242
+ description: How aesthetically pleasing is the image according to human evaluators.
243
+ lower_is_better: false
244
+
245
+ - name: image_text_alignment_human
246
+ display_name: Image text alignment (human)
247
+ short_display_name: Image text alignment (human)
248
+ description: How well does the image match the description according to human evaluators.
249
+ lower_is_better: false
250
+
251
+ - name: clear_subject_human
252
+ display_name: Clear subject (human)
253
+ short_display_name: Clear subject (human)
254
+ description: Whether it is clear who the subject(s) of the image are, according to human evaluators.
255
+ lower_is_better: false
256
+
257
+ - name: originality_human
258
+ display_name: Originality (human)
259
+ short_display_name: Originality (human)
260
+ description: How original is the image, given it was created with the description
261
+ according to human evaluators.
262
+ lower_is_better: false
263
+
264
+ - name: copyright_human
265
+ display_name: Copyright (human)
266
+ short_display_name: Copyright (human)
267
+ description: Whether the image is a derivative work of another image according to
268
+ human evaluators.
269
+ lower_is_better: true
270
+
271
+ - name: toxic_prompt_perspective_api
272
+ display_name: Toxic prompt (Perspective API)
273
+ short_display_name: Toxic prompt (Perspective API)
274
+ description: Whether the prompt is toxic according to the Perspective API
275
+ lower_is_better: true
276
+
277
+ - name: toxic_prompt_openai
278
+ display_name: Toxic prompt (OpenAI)
279
+ short_display_name: Toxic prompt (OpenAI)
280
+ description: Whether the prompt violated [OpenAI's content policy](https://labs.openai.com/policies/content-policy).
281
+ lower_is_better: true
282
+
283
+ - name: expected_max_nsfw_score
284
+ display_name: Expected maximum NSFW score
285
+ short_display_name: Expected max NSFW score
286
+ description: NSFW (not safe for work) classifier from LAION
287
+ lower_is_better: true
288
+
289
+ - name: expected_max_nsfw_score_given_toxic_prompt=True
290
+ display_name: Expected maximum NSFW score given toxic prompt
291
+ short_display_name: Expected max NSFW score given toxic prompt
292
+ description: NSFW (not safe for work) classifier from LAION
293
+ lower_is_better: true
294
+
295
+ - name: expected_max_nsfw_score_given_toxic_prompt=False
296
+ display_name: Expected maximum NSFW score given non-toxic prompt
297
+ short_display_name: Expected max NSFW score given non-toxic prompt
298
+ description: NSFW (not safe for work) classifier from LAION
299
+ lower_is_better: true
300
+
301
+ - name: fractal_dimension_loss
302
+ display_name: Fractal dimension loss
303
+ short_display_name: Fractal dimension loss
304
+ description: Fractal coefficient is a measure of the fractal patterns in an image.
305
+ The ideal value is 1.4.
306
+ lower_is_better: true
307
+
308
+ - name: max_nsfw_probability
309
+ display_name: Expected maximum NSFW probability
310
+ short_display_name: Expected max NSFW probability
311
+ description: NSFW (not safe for work) classifier from LAION
312
+ lower_is_better: true
313
+
314
+ - name: max_nsfw_probability_given_toxic_prompt=True
315
+ display_name: Expected maximum NSFW probability given toxic prompt
316
+ short_display_name: Expected max NSFW probability given toxic prompt
317
+ description: NSFW (not safe for work) classifier from LAION
318
+ lower_is_better: true
319
+
320
+ - name: max_nsfw_probability_given_toxic_prompt=False
321
+ display_name: Expected maximum NSFW probability given non-toxic prompt
322
+ short_display_name: Expected max NSFW probability given non-toxic prompt
323
+ description: NSFW (not safe for work) classifier from LAION
324
+ lower_is_better: true
325
+
326
+ - name: nsfw_frac
327
+ display_name: NSFW frac
328
+ short_display_name: NSFW frac
329
+ description: NSFW (not safe for work) classifier from LAION
330
+ lower_is_better: true
331
+
332
+ - name: nsfw_frac_given_toxic_prompt=True
333
+ display_name: NSFW frac given toxic prompt
334
+ short_display_name: NSFW frac given toxic prompt
335
+ description: NSFW (not safe for work) classifier from LAION
336
+ lower_is_better: true
337
+
338
+ - name: nsfw_frac_given_toxic_prompt=False
339
+ display_name: NSFW frac given non-toxic prompt
340
+ short_display_name: NSFW frac given non-toxic prompt
341
+ description: NSFW (not safe for work) classifier from LAION
342
+ lower_is_better: true
343
+
344
+ - name: nudity_frac
345
+ display_name: Nudity frac
346
+ short_display_name: Nudity frac
347
+ description: Pre-trained NudeNet to determine whether an image contains nudity.
348
+ lower_is_better: true
349
+
350
+ - name: blacked_out_frac
351
+ display_name: Blacked out frac
352
+ short_display_name: Blacked out frac
353
+ description: This metric represents whether the image has been blacked out.
354
+ lower_is_better: true
355
+
356
+ - name: blacked_out_frac_given_toxic_prompt=True
357
+ display_name: Blacked out frac given toxic prompt
358
+ short_display_name: Blacked out frac given toxic prompt
359
+ description: This metric represents whether the image has been blacked out.
360
+ lower_is_better: true
361
+
362
+ - name: blacked_out_frac_given_toxic_prompt=False
363
+ display_name: Blacked out frac given non-toxic prompt
364
+ short_display_name: Blacked out frac given non-toxic prompt
365
+ description: This metric represents whether the image has been blacked out.
366
+ lower_is_better: true
367
+
368
+ - name: watermark_frac
369
+ display_name: Watermark frac
370
+ short_display_name: Watermark frac
371
+ description: Watermark detector from LAION to determine whether an image contains
372
+ watermarks.
373
+ lower_is_better: true
374
+
375
+ - name: expected_max_watermark_prob
376
+ display_name: Expected maximum watermark prob
377
+ short_display_name: Expected max watermark prob
378
+ description: Watermark detector from LAION to determine whether an image contains
379
+ watermarks.
380
+ lower_is_better: true
381
+
382
+ - name: photorealism_generated_human
383
+ display_name: Photorealism w/ generated images (human)
384
+ short_display_name: Photorealism - generated (human)
385
+ description: Determine if the following image is AI-generated or real according
386
+ to human raters.
387
+ lower_is_better: false
388
+
389
+ - name: photorealism_real_human
390
+ display_name: Photorealism w/ real images (human)
391
+ short_display_name: Photorealism - real (human)
392
+ description: Determine if the following image is AI-generated or real according
393
+ to human raters.
394
+ lower_is_better: false
395
+
396
+ - name: denoised_runtime
397
+ display_name: Denoised runtime (in seconds)
398
+ short_display_name: Denoised runtime (s)
399
+ description: Denoised runtime is the runtime with the performance variance factored
400
+ out as described [here](https://arxiv.org/abs/2305.02440).
401
+ lower_is_better: true
402
+
403
+ - name: prompt_length
404
+ display_name: Prompt length (number of characters)
405
+ short_display_name: Prompt length (characters)
406
+ description: The number of characters in the prompt
407
+ lower_is_better: false
408
+
409
+ - name: inference_runtime
410
+ display_name: Inference runtime (in seconds)
411
+ short_display_name: Inference runtime (s)
412
+ description: How long it took to generate the images
413
+ lower_is_better: true
414
+
415
+ - name: num_generated_images
416
+ display_name: Number of generated images
417
+ short_display_name: Number of generated images
418
+ description: The number of images the model generated
419
+ lower_is_better: false
420
+ perturbations:
421
+
422
+ - name: robustness
423
+ display_name: Robustness
424
+ description: Computes worst case over different robustness perturbations (misspellings,
425
+ formatting, contrast sets).
426
+
427
+ - name: fairness
428
+ display_name: Fairness
429
+ description: Computes worst case over different fairness perturbations (changing
430
+ dialect, race of names, gender).
431
+
432
+ - name: typos
433
+ display_name: Typos
434
+ description: 'Randomly adds typos to each token in the input with probability 0.05
435
+ and computes the per-instance worst-case performance between perturbed and unperturbed
436
+ versions.'
437
+
438
+ - name: synonym
439
+ display_name: Synonyms
440
+ description: 'Randomly substitutes words in the input with WordNet synonyms with
441
+ probability 0.5 and computes the per-instance worst-case performance between perturbed
442
+ and unperturbed versions.'
443
+
444
+ - name: dialect
445
+ display_name: SAE -> AAE
446
+ short_display_name: Dialect
447
+ description: 'Deterministically substitutes SAE words in input with AAE counterparts
448
+ using validated dictionary of [Ziems et al. (2022)](https://aclanthology.org/2022.acl-long.258/)
449
+ and computes the per-instance worst-case performance between perturbed and unperturbed
450
+ versions.'
451
+
452
+ - name: race
453
+ display_name: First names by race (White -> Black)
454
+ short_display_name: Race
455
+ description: 'Deterministically substitutes White first names with Black first names
456
+ sampled from the lists of [Caliskan et al. (2017)](https://www.science.org/doi/10.1126/science.aal4230)
457
+ and computes the per-instance worst-case performance between perturbed and unperturbed
458
+ versions.'
459
+
460
+ - name: gender
461
+ display_name: Pronouns by gender (Male -> Female)
462
+ short_display_name: Gender
463
+ description: 'Deterministically substitutes male pronouns with female pronouns and
464
+ computes the per-instance worst-case performance between perturbed and unperturbed
465
+ versions.'
466
+
467
+ - name: translate
468
+ display_name: Translate
469
+ short_display_name: Translate
470
+ description: 'Translate text to other languages.'
471
+
472
+ ############################################################
473
+ metric_groups:
474
+
475
+ - name: efficiency
476
+ display_name: Efficiency
477
+ metrics:
478
+ - name: inference_denoised_runtime
479
+ split: "${main_split}"
480
+
481
+ - name: efficiency_detailed
482
+ display_name: Efficiency
483
+ description: The efficiency of the model across both training and inference.
484
+ metrics:
485
+ - name: inference_runtime
486
+ split: "${main_split}"
487
+ - name: inference_idealized_runtime
488
+ split: "${main_split}"
489
+ - name: inference_denoised_runtime
490
+ split: "${main_split}"
491
+ - name: training_co2_cost
492
+ split: "${main_split}"
493
+ - name: training_energy_cost
494
+ split: "${main_split}"
495
+
496
+ - name: general_information
497
+ display_name: General information
498
+ metrics:
499
+ - name: num_instances
500
+ split: "${main_split}"
501
+ - name: prompt_truncated
502
+ split: "${main_split}"
503
+ - name: num_prompt_tokens
504
+ split: "${main_split}"
505
+
506
+ - name: heim_inception
507
+ display_name: HEIM inception metrics
508
+ metrics:
509
+ - name: fid
510
+ split: __all__
511
+ - name: inception_score
512
+ split: __all__
513
+ - name: kernel_inception_distance
514
+ split: __all__
515
+
516
+ - name: heim_fidelity
517
+ display_name: HEIM Image fidelity metrics
518
+ metrics:
519
+ - name: expected_lpips_score
520
+ split: "${main_split}"
521
+ - name: expected_multi_scale_ssim_score
522
+ split: "${main_split}"
523
+ - name: expected_psnr_score
524
+ split: "${main_split}"
525
+ - name: expected_uiqi_score
526
+ split: "${main_split}"
527
+
528
+ - name: heim_gender
529
+ display_name: HEIM gender metrics
530
+ metrics:
531
+ - name: gender_imbalance
532
+ split: "${main_split}"
533
+
534
+ - name: heim_skin_tone
535
+ display_name: HEIM skin tone metrics
536
+ metrics:
537
+ - name: skin_tone_imbalance
538
+ split: "${main_split}"
539
+
540
+ - name: heim_alignment_human_metrics
541
+ display_name: Image-text Alignment - Human Evaluation
542
+ metrics:
543
+ - name: image_text_alignment_human
544
+ split: __all__
545
+ perturbation_name: __all__
546
+
547
+ - name: heim_alignment_clip_metrics
548
+ display_name: Image-text Alignment - CLIP Score
549
+ metrics:
550
+ - name: expected_clip_score
551
+ split: "${main_split}"
552
+ - name: max_clip_score
553
+ split: "${main_split}"
554
+
555
+ - name: heim_quality_human_metrics
556
+ display_name: Quality - Human Evaluation
557
+ metrics:
558
+ - name: photorealism_generated_human
559
+ split: __all__
560
+ perturbation_name: __all__
561
+
562
+ - name: heim_quality_fid_metrics
563
+ display_name: Quality - FID
564
+ metrics:
565
+ - name: fid
566
+ split: __all__
567
+
568
+ - name: heim_quality_inception_metrics
569
+ display_name: Quality - Inception
570
+ metrics:
571
+ - name: inception_score
572
+ split: __all__
573
+ - name: kernel_inception_distance
574
+ split: __all__
575
+
576
+ - name: heim_quality_automated_metrics
577
+ display_name: Quality - Other Automated Metrics
578
+ metrics:
579
+ - name: expected_lpips_score
580
+ split: "${main_split}"
581
+ - name: expected_multi_scale_ssim_score
582
+ split: "${main_split}"
583
+ - name: expected_psnr_score
584
+ split: "${main_split}"
585
+ - name: expected_uiqi_score
586
+ split: "${main_split}"
587
+
588
+ - name: heim_aesthetics_human_metrics
589
+ display_name: Aesthetics - Human Evaluation
590
+ metrics:
591
+ - name: clear_subject_human
592
+ split: __all__
593
+ perturbation_name: __all__
594
+ - name: aesthetics_human
595
+ split: __all__
596
+ perturbation_name: __all__
597
+
598
+ - name: heim_aesthetics_automated_metrics
599
+ display_name: Aesthetics - Automated Metrics
600
+ metrics:
601
+ - name: expected_aesthetics_score
602
+ split: "${main_split}"
603
+ - name: max_aesthetics_score
604
+ split: "${main_split}"
605
+ - name: fractal_dimension_loss
606
+ split: "${main_split}"
607
+
608
+ - name: heim_originality_human_metrics
609
+ display_name: Originality - Human Evaluation
610
+ metrics:
611
+ - name: originality_human
612
+ split: __all__
613
+ perturbation_name: __all__
614
+
615
+ - name: heim_originality_watermark_metrics
616
+ display_name: Originality - Watermark
617
+ metrics:
618
+ - name: watermark_frac
619
+ split: "${main_split}"
620
+
621
+ - name: heim_reasoning_metrics
622
+ display_name: Reasoning
623
+ metrics:
624
+ - name: detection_correct_frac
625
+ split: "${main_split}"
626
+
627
+ - name: heim_bias_metrics
628
+ display_name: Bias
629
+ metrics:
630
+ - name: gender_imbalance
631
+ split: "${main_split}"
632
+ - name: skin_tone_imbalance
633
+ split: "${main_split}"
634
+
635
+ - name: heim_gender_metrics
636
+ display_name: Gender imbalance
637
+ metrics:
638
+ - name: gender_imbalance
639
+ split: "${main_split}"
640
+
641
+ - name: heim_skin_tone_metrics
642
+ display_name: Skin tone imbalance
643
+ metrics:
644
+ - name: skin_tone_imbalance
645
+ split: "${main_split}"
646
+
647
+ - name: heim_toxicity_metrics
648
+ display_name: Toxicity (NSFW)
649
+ metrics:
650
+ - name: nsfw_frac_given_toxic_prompt=False
651
+ split: "${main_split}"
652
+
653
+ - name: heim_toxicity_black_metrics
654
+ display_name: Toxicity (blacked out images)
655
+ metrics:
656
+ - name: blacked_out_frac
657
+ split: "${main_split}"
658
+
659
+ - name: heim_toxicity_nudity_metrics
660
+ display_name: Toxicity (nudity)
661
+ metrics:
662
+ - name: nudity_frac
663
+ split: "${main_split}"
664
+
665
+ - name: heim_efficiency_metrics
666
+ display_name: Efficiency
667
+ metrics:
668
+ - name: denoised_runtime
669
+ split: __all__
670
+
671
+ - name: heim_photorealism
672
+ display_name: HEIM photorealism metrics
673
+ metrics:
674
+ - name: photorealism_generated_human
675
+ split: __all__
676
+ perturbation_name: __all__
677
+
678
+ ############################################################
679
+ run_groups:
680
+
681
+ - name: core_scenarios
682
+ display_name: All scenarios
683
+ description: All scenarios
684
+ category: All scenarios
685
+ subgroups:
686
+ - mscoco_base
687
+ - mscoco_fid
688
+ - mscoco_efficiency
689
+ - mscoco_gender
690
+ - mscoco_dialect
691
+ - mscoco_robustness
692
+ - mscoco_chinese
693
+ - mscoco_hindi
694
+ - mscoco_spanish
695
+ - mscoco_art_styles
696
+ - cub200
697
+ - draw_bench_image_quality
698
+ - parti_prompts_image_quality
699
+ - daily_dalle
700
+ - landing_page
701
+ - logos
702
+ - magazine_cover
703
+ - common_syntactic_processes
704
+ - draw_bench_reasoning
705
+ - parti_prompts_reasoning
706
+ - relational_understanding
707
+ - detection
708
+ - winoground
709
+ - parti_prompts_knowledge
710
+ - draw_bench_knowledge
711
+ - time_most_significant_historical_figures
712
+ - demographic_stereotypes
713
+ - mental_disorders
714
+ - i2p
715
+
716
+ - name: heim_alignment_scenarios
717
+ display_name: Alignment
718
+ description: Is the image semantically correct given the text (image-text alignment)
719
+ category: Scenarios for specific aspects
720
+ subgroups:
721
+ - mscoco_base
722
+ - cub200
723
+ - draw_bench_image_quality
724
+ - draw_bench_reasoning
725
+ - draw_bench_knowledge
726
+ - parti_prompts_image_quality
727
+ - parti_prompts_reasoning
728
+ - parti_prompts_knowledge
729
+
730
+ - name: heim_quality_scenarios
731
+ display_name: Quality
732
+ description: Do the generated images look like real images/photos
733
+ category: Scenarios for specific aspects
734
+ subgroups:
735
+ - mscoco_base
736
+ - draw_bench_image_quality
737
+ - parti_prompts_image_quality
738
+
739
+ - name: heim_aesthetics_scenarios
740
+ display_name: Aesthetics
741
+ description: Is the image aesthetically pleasing
742
+ category: Scenarios for specific aspects
743
+ subgroups:
744
+ - mscoco_base
745
+ - mscoco_art_styles
746
+ - daily_dalle
747
+ - logos
748
+ - landing_page
749
+ - magazine_cover
750
+
751
+ - name: heim_originality_scenarios
752
+ display_name: Originality
753
+ description: Does the model generate creative images and prevent copyright infringement
754
+ category: Scenarios for specific aspects
755
+ subgroups:
756
+ - daily_dalle
757
+ - logos
758
+ - landing_page
759
+ - magazine_cover
760
+
761
+ - name: heim_reasoning_scenarios
762
+ display_name: Reasoning
763
+ description: Does the model understand objects, counts, and spatial relations (compositionality)
764
+ category: Scenarios for specific aspects
765
+ subgroups:
766
+ - common_syntactic_processes
767
+ - draw_bench_reasoning
768
+ - parti_prompts_reasoning
769
+ - relational_understanding
770
+ - detection
771
+ - winoground
772
+
773
+ - name: heim_knowledge_scenarios
774
+ display_name: Knowledge
775
+ description: Does the model have knowledge about the world or domains
776
+ category: Scenarios for specific aspects
777
+ subgroups:
778
+ - time_most_significant_historical_figures
779
+ - draw_bench_knowledge
780
+ - parti_prompts_knowledge
781
+
782
+ - name: heim_bias_scenarios
783
+ display_name: Bias
784
+ description: Are the generated images biased in demographic representation (e.g.,
785
+ gender, skin tone)
786
+ category: Scenarios for specific aspects
787
+ subgroups:
788
+ - demographic_stereotypes
789
+ - mental_disorders
790
+
791
+ - name: heim_toxicity_scenarios
792
+ display_name: Toxicity
793
+ description: Does the model generate toxic or inappropriate images (e.g., violence,
794
+ sexual, illegal content)
795
+ category: Scenarios for specific aspects
796
+ subgroups:
797
+ - i2p
798
+
799
+ - name: heim_fairness_dialect_scenarios
800
+ display_name: Fairness - African American Vernacular English (AAVE)
801
+ description: Does the model exhibit performance disparities across social groups.
802
+ The African American Vernacular English (AAVE) dialect perturbation converts each
803
+ word to the corresponding word in AAVE if one exists.
804
+ category: Scenarios for specific aspects
805
+ subgroups:
806
+ - mscoco_dialect
807
+
808
+ - name: heim_fairness_gender_scenarios
809
+ display_name: Fairness - Gender
810
+ description: Does the model exhibit performance disparities across social groups.
811
+ The gender perturbation maps male gender terms to female gender terms (e.g., son
812
+ to daughter and father to mother).
813
+ category: Scenarios for specific aspects
814
+ subgroups:
815
+ - mscoco_gender
816
+
817
+ - name: heim_robustness_scenarios
818
+ display_name: Robustness
819
+ description: Is the model robust to invariant input perturbations
820
+ category: Scenarios for specific aspects
821
+ subgroups:
822
+ - mscoco_robustness
823
+
824
+ - name: heim_multilinguality_chinese_scenarios
825
+ display_name: Multilinguality (Chinese)
826
+ description: Does the model support non-English languages.
827
+ category: Scenarios for specific aspects
828
+ subgroups:
829
+ - mscoco_chinese
830
+
831
+ - name: heim_multilinguality_hindi_scenarios
832
+ display_name: Multilinguality (Hindi)
833
+ description: Does the model support non-English languages.
834
+ category: Scenarios for specific aspects
835
+ subgroups:
836
+ - mscoco_hindi
837
+
838
+ - name: heim_multilinguality_spanish_scenarios
839
+ display_name: Multilinguality (Spanish)
840
+ description: Does the model support non-English languages.
841
+ category: Scenarios for specific aspects
842
+ subgroups:
843
+ - mscoco_spanish
844
+
845
+ - name: heim_fid_scenarios
846
+ display_name: Fidelity
847
+ description: Fidelity metrics computed with MS-COCO.
848
+ category: Scenarios for specific aspects
849
+ subgroups:
850
+ - mscoco_fid
851
+
852
+ - name: heim_efficiency_scenarios
853
+ display_name: Efficiency
854
+ description: How fast is inference for the model
855
+ category: Scenarios for specific aspects
856
+ subgroups:
857
+ - mscoco_efficiency
858
+
859
+ - name: heim_art_styles_scenarios
860
+ display_name: Art styles
861
+ description: To test the ability of these models to generate images in specific
862
+ art styles.
863
+ category: Scenarios for specific aspects
864
+ subgroups:
865
+ - mscoco_art_styles
866
+
867
+ - name: mscoco
868
+ display_name: MS-COCO (all)
869
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312)).
870
+ metric_groups:
871
+ - heim_alignment_human_metrics
872
+ - heim_alignment_clip_metrics
873
+ - heim_quality_human_metrics
874
+ - heim_quality_automated_metrics
875
+ - heim_aesthetics_human_metrics
876
+ - heim_aesthetics_automated_metrics
877
+ - heim_originality_watermark_metrics
878
+ - heim_toxicity_metrics
879
+ - heim_toxicity_black_metrics
880
+ - heim_toxicity_nudity_metrics
881
+ - general_information
882
+ environment:
883
+ main_split: valid
884
+ taxonomy:
885
+ task: Image quality
886
+
887
+ - name: mscoco_base
888
+ display_name: MS-COCO (base)
889
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312)).
890
+ metric_groups:
891
+ - heim_alignment_human_metrics
892
+ - heim_alignment_clip_metrics
893
+ - heim_quality_human_metrics
894
+ - heim_quality_automated_metrics
895
+ - heim_aesthetics_human_metrics
896
+ - heim_aesthetics_automated_metrics
897
+ - heim_originality_watermark_metrics
898
+ - heim_toxicity_metrics
899
+ - heim_toxicity_black_metrics
900
+ - heim_toxicity_nudity_metrics
901
+ - general_information
902
+ environment:
903
+ main_split: valid
904
+ taxonomy:
905
+ task: Image quality
906
+
907
+ - name: mscoco_fid
908
+ display_name: MS-COCO Fidelity
909
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
910
+ for fidelity. To compute the FID, we randomly selected 30,000 text prompts from
911
+ MS-COCO and generated a single image for each prompt using the text-to-image generation
912
+ model that we are evaluating. Then, we used [pytorch-fid](https://github.com/mseitzer/pytorch-fid)
913
+ to compute the FID between the set of real images associated with the prompts
914
+ and the set of generated images.
915
+ metric_groups:
916
+ - heim_quality_fid_metrics
917
+ - heim_quality_inception_metrics
918
+ - general_information
919
+ environment:
920
+ main_split: valid
921
+ taxonomy:
922
+ task: Image quality
923
+
924
+ - name: mscoco_efficiency
925
+ display_name: MS-COCO Efficiency
926
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
927
+ for efficiency.
928
+ metric_groups:
929
+ - heim_efficiency_metrics
930
+ - general_information
931
+ environment:
932
+ main_split: valid
933
+ taxonomy:
934
+ task: Efficiency
935
+
936
+ - name: mscoco_gender
937
+ display_name: MS-COCO (fairness - gender)
938
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
939
+ with the gender perturbation, which maps male gender terms to female gender terms (e.g.,
940
+ son to daughter and father to mother).
941
+ metric_groups:
942
+ - heim_alignment_human_metrics
943
+ - heim_quality_human_metrics
944
+ - heim_aesthetics_human_metrics
945
+ - general_information
946
+ environment:
947
+ main_split: valid
948
+ taxonomy:
949
+ task: Image quality
950
+
951
+ - name: mscoco_dialect
952
+ display_name: MS-COCO (fairness - AAVE dialect)
953
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
954
+ with the African American Vernacular English (AAVE) dialect perturbation, which
955
+ converts each word to the corresponding word in AAVE if one exists.
956
+ metric_groups:
957
+ - heim_alignment_human_metrics
958
+ - heim_quality_human_metrics
959
+ - heim_aesthetics_human_metrics
960
+ - general_information
961
+ environment:
962
+ main_split: valid
963
+ taxonomy:
964
+ task: Image quality
965
+
966
+ - name: mscoco_robustness
967
+ display_name: MS-COCO (robustness)
968
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
969
+ with input perturbations.
970
+ metric_groups:
971
+ - heim_alignment_human_metrics
972
+ - heim_quality_human_metrics
973
+ - heim_aesthetics_human_metrics
974
+ - general_information
975
+ environment:
976
+ main_split: valid
977
+ taxonomy:
978
+ task: Image quality
979
+
980
+ - name: mscoco_chinese
981
+ display_name: MS-COCO (Chinese)
982
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
983
+ with prompts translated to Chinese.
984
+ metric_groups:
985
+ - heim_alignment_human_metrics
986
+ - heim_quality_human_metrics
987
+ - heim_aesthetics_human_metrics
988
+ - general_information
989
+ environment:
990
+ main_split: valid
991
+ taxonomy:
992
+ task: Image quality
993
+
994
+ - name: mscoco_hindi
995
+ display_name: MS-COCO (Hindi)
996
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
997
+ with prompts translated to Hindi.
998
+ metric_groups:
999
+ - heim_alignment_human_metrics
1000
+ - heim_quality_human_metrics
1001
+ - heim_aesthetics_human_metrics
1002
+ - general_information
1003
+ environment:
1004
+ main_split: valid
1005
+ taxonomy:
1006
+ task: Image quality
1007
+
1008
+ - name: mscoco_spanish
1009
+ display_name: MS-COCO (Spanish)
1010
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
1011
+ with prompts translated to Spanish.
1012
+ metric_groups:
1013
+ - heim_alignment_human_metrics
1014
+ - heim_quality_human_metrics
1015
+ - heim_aesthetics_human_metrics
1016
+ - general_information
1017
+ environment:
1018
+ main_split: valid
1019
+ taxonomy:
1020
+ task: Image quality
1021
+
1022
+ - name: mscoco_art_styles
1023
+ display_name: MS-COCO (Art styles)
1024
+ description: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))
1025
+ with prompts that generate images in specific art styles.
1026
+ metric_groups:
1027
+ - heim_alignment_human_metrics
1028
+ - heim_quality_human_metrics
1029
+ - heim_aesthetics_human_metrics
1030
+ - general_information
1031
+ environment:
1032
+ main_split: valid
1033
+ taxonomy:
1034
+ task: Image quality
1035
+
1036
+ - name: cub200
1037
+ display_name: Caltech-UCSD Birds-200-2011
1038
+ description: Caltech-UCSD Birds-200-2011 is a challenging dataset of 200 bird species
1039
+ with 10 captions for each bird ([paper](https://authors.library.caltech.edu/27452/1/CUB_200_2011.pdf),
1040
+ [paper](https://arxiv.org/abs/1711.10485)).
1041
+ metric_groups:
1042
+ - heim_alignment_clip_metrics
1043
+ - heim_quality_automated_metrics
1044
+ - heim_aesthetics_automated_metrics
1045
+ - heim_originality_watermark_metrics
1046
+ - heim_toxicity_metrics
1047
+ - heim_toxicity_black_metrics
1048
+ - heim_toxicity_nudity_metrics
1049
+ - general_information
1050
+ environment:
1051
+ main_split: test
1052
+ taxonomy:
1053
+ task: Image quality
1054
+
1055
+ - name: draw_bench_image_quality
1056
+ display_name: DrawBench (image quality categories)
1057
+ description: A comprehensive and challenging set of prompts that support the evaluation
1058
+ and comparison of text-to-image models.
1059
+ metric_groups:
1060
+ - heim_alignment_human_metrics
1061
+ - heim_alignment_clip_metrics
1062
+ - heim_aesthetics_automated_metrics
1063
+ - heim_originality_watermark_metrics
1064
+ - heim_toxicity_metrics
1065
+ - heim_toxicity_black_metrics
1066
+ - heim_toxicity_nudity_metrics
1067
+ - general_information
1068
+ environment:
1069
+ main_split: test
1070
+ taxonomy:
1071
+ task: Image quality
1072
+
1073
+ - name: parti_prompts_image_quality
1074
+ display_name: PartiPrompts (image quality categories)
1075
+ description: PartiPrompts (P2) is a set of 1600 diverse English prompts that allow
1076
+ us to more comprehensively evaluate and test the limits of text-to-image synthesis
1077
+ models.
1078
+ metric_groups:
1079
+ - heim_alignment_human_metrics
1080
+ - heim_alignment_clip_metrics
1081
+ - heim_aesthetics_automated_metrics
1082
+ - heim_originality_watermark_metrics
1083
+ - heim_toxicity_metrics
1084
+ - heim_toxicity_black_metrics
1085
+ - heim_toxicity_nudity_metrics
1086
+ - general_information
1087
+ environment:
1088
+ main_split: test
1089
+ taxonomy:
1090
+ task: Image quality
1091
+
1092
+ - name: daily_dalle
1093
+ display_name: dailydall.e
1094
+ description: DALL-E 2 prompts from [Chad Nelson's Instagram](https://www.instagram.com/dailydall.e/)
1095
+ metric_groups:
1096
+ - heim_alignment_human_metrics
1097
+ - heim_alignment_clip_metrics
1098
+ - heim_aesthetics_human_metrics
1099
+ - heim_aesthetics_automated_metrics
1100
+ - heim_originality_human_metrics
1101
+ - heim_originality_watermark_metrics
1102
+ - heim_toxicity_metrics
1103
+ - heim_toxicity_black_metrics
1104
+ - heim_toxicity_nudity_metrics
1105
+ - general_information
1106
+ environment:
1107
+ main_split: test
1108
+ taxonomy:
1109
+ task: Originality
1110
+
1111
+ - name: landing_page
1112
+ display_name: Landing Page
1113
+ description: Prompts to generate landing pages for mobile or web applications.
1114
+ metric_groups:
1115
+ - heim_alignment_human_metrics
1116
+ - heim_alignment_clip_metrics
1117
+ - heim_aesthetics_human_metrics
1118
+ - heim_aesthetics_automated_metrics
1119
+ - heim_originality_human_metrics
1120
+ - heim_originality_watermark_metrics
1121
+ - heim_toxicity_metrics
1122
+ - heim_toxicity_black_metrics
1123
+ - heim_toxicity_nudity_metrics
1124
+ - general_information
1125
+ environment:
1126
+ main_split: test
1127
+ taxonomy:
1128
+ task: Originality
1129
+
1130
+ - name: logos
1131
+ display_name: Logos
1132
+ description: Prompts to generate logos for brands and companies
1133
+ metric_groups:
1134
+ - heim_alignment_human_metrics
1135
+ - heim_alignment_clip_metrics
1136
+ - heim_aesthetics_human_metrics
1137
+ - heim_aesthetics_automated_metrics
1138
+ - heim_originality_human_metrics
1139
+ - heim_originality_watermark_metrics
1140
+ - heim_toxicity_metrics
1141
+ - heim_toxicity_black_metrics
1142
+ - heim_toxicity_nudity_metrics
1143
+ - general_information
1144
+ environment:
1145
+ main_split: test
1146
+ taxonomy:
1147
+ task: Originality
1148
+
1149
+ - name: magazine_cover
1150
+ display_name: Magazine Cover Photos
1151
+ description: Prompts to generate magazine cover photos
1152
+ metric_groups:
1153
+ - heim_alignment_human_metrics
1154
+ - heim_alignment_clip_metrics
1155
+ - heim_aesthetics_human_metrics
1156
+ - heim_aesthetics_automated_metrics
1157
+ - heim_originality_human_metrics
1158
+ - heim_originality_watermark_metrics
1159
+ - heim_toxicity_metrics
1160
+ - heim_toxicity_black_metrics
1161
+ - heim_toxicity_nudity_metrics
1162
+ - general_information
1163
+ environment:
1164
+ main_split: test
1165
+ taxonomy:
1166
+ task: Originality
1167
+
1168
+ - name: common_syntactic_processes
1169
+ display_name: Common Syntactic Processes
1170
+ description: Prompts from 8 different grammatical phenomena ([paper](https://arxiv.org/abs/2210.12889)).
1171
+ metric_groups:
1172
+ - heim_alignment_human_metrics
1173
+ - heim_alignment_clip_metrics
1174
+ - heim_aesthetics_automated_metrics
1175
+ - heim_originality_watermark_metrics
1176
+ - heim_toxicity_metrics
1177
+ - heim_toxicity_black_metrics
1178
+ - heim_toxicity_nudity_metrics
1179
+ - general_information
1180
+ environment:
1181
+ main_split: test
1182
+ taxonomy:
1183
+ task: Reasoning
1184
+
1185
+ - name: draw_bench_reasoning
1186
+ display_name: DrawBench (reasoning categories)
1187
+ description: A comprehensive and challenging set of prompts that support the evaluation
1188
+ and comparison of text-to-image models.
1189
+ metric_groups:
1190
+ - heim_alignment_human_metrics
1191
+ - heim_alignment_clip_metrics
1192
+ - heim_aesthetics_automated_metrics
1193
+ - heim_originality_watermark_metrics
1194
+ - heim_toxicity_metrics
1195
+ - heim_toxicity_black_metrics
1196
+ - heim_toxicity_nudity_metrics
1197
+ - general_information
1198
+ environment:
1199
+ main_split: test
1200
+ taxonomy:
1201
+ task: Reasoning
1202
+
1203
+ - name: parti_prompts_reasoning
1204
+ display_name: PartiPrompts (reasoning categories)
1205
+ description: PartiPrompts (P2) is a set of 1600 diverse English prompts that allow
1206
+ us to more comprehensively evaluate and test the limits of text-to-image synthesis
1207
+ models.
1208
+ metric_groups:
1209
+ - heim_alignment_human_metrics
1210
+ - heim_alignment_clip_metrics
1211
+ - heim_aesthetics_automated_metrics
1212
+ - heim_originality_watermark_metrics
1213
+ - heim_toxicity_metrics
1214
+ - heim_toxicity_black_metrics
1215
+ - heim_toxicity_nudity_metrics
1216
+ - general_information
1217
+ environment:
1218
+ main_split: test
1219
+ taxonomy:
1220
+ task: Reasoning
1221
+
1222
+ - name: relational_understanding
1223
+ display_name: Relational Understanding
1224
+ description: Based on existing cognitive, linguistic, and developmental literature,
1225
+ the authors created a set of 15 relations (8 physical, 7 agentic) and a set of
1226
+ 12 entities (6 objects, 6 agents)
1227
+ metric_groups:
1228
+ - heim_alignment_human_metrics
1229
+ - heim_alignment_clip_metrics
1230
+ - heim_aesthetics_automated_metrics
1231
+ - heim_originality_watermark_metrics
1232
+ - heim_toxicity_metrics
1233
+ - heim_toxicity_black_metrics
1234
+ - heim_toxicity_nudity_metrics
1235
+ - general_information
1236
+ environment:
1237
+ main_split: test
1238
+ taxonomy:
1239
+ task: Reasoning
1240
+
1241
+ - name: detection
1242
+ display_name: Detection (PaintSkills)
1243
+ description: A compositional diagnostic dataset and evaluation toolkit that measures
1244
+ visual reasoning skills ([paper](https://arxiv.org/abs/2202.04053)).
1245
+ metric_groups:
1246
+ - heim_alignment_human_metrics
1247
+ - heim_alignment_clip_metrics
1248
+ - heim_aesthetics_automated_metrics
1249
+ - heim_originality_watermark_metrics
1250
+ - heim_reasoning_metrics
1251
+ - heim_toxicity_metrics
1252
+ - heim_toxicity_black_metrics
1253
+ - heim_toxicity_nudity_metrics
1254
+ - general_information
1255
+ environment:
1256
+ main_split: test
1257
+ taxonomy:
1258
+ task: Reasoning
1259
+
1260
+ - name: winoground
1261
+ display_name: Winoground
1262
+ description: Winoground is a novel task and dataset for evaluating the ability of
1263
+ vision and language models to conduct visio-linguistic compositional reasoning
1264
+ ([paper](https://arxiv.org/abs/2204.03162)).
1265
+ metric_groups:
1266
+ - heim_alignment_human_metrics
1267
+ - heim_alignment_clip_metrics
1268
+ - heim_quality_automated_metrics
1269
+ - heim_aesthetics_automated_metrics
1270
+ - heim_originality_watermark_metrics
1271
+ - heim_toxicity_metrics
1272
+ - heim_toxicity_black_metrics
1273
+ - heim_toxicity_nudity_metrics
1274
+ - general_information
1275
+ environment:
1276
+ main_split: test
1277
+ taxonomy:
1278
+ task: Reasoning
1279
+
1280
+ - name: parti_prompts_knowledge
1281
+ display_name: PartiPrompts (knowledge categories)
1282
+ description: PartiPrompts (P2) is a set of 1600 diverse English prompts that allow
1283
+ us to more comprehensively evaluate and test the limits of text-to-image synthesis
1284
+ models.
1285
+ metric_groups:
1286
+ - heim_alignment_human_metrics
1287
+ - heim_alignment_clip_metrics
1288
+ - heim_aesthetics_automated_metrics
1289
+ - heim_originality_watermark_metrics
1290
+ - heim_toxicity_metrics
1291
+ - heim_toxicity_black_metrics
1292
+ - heim_toxicity_nudity_metrics
1293
+ - general_information
1294
+ environment:
1295
+ main_split: test
1296
+ taxonomy:
1297
+ task: Knowledge
1298
+
1299
+ - name: draw_bench_knowledge
1300
+ display_name: DrawBench (knowledge categories)
1301
+ description: A comprehensive and challenging set of prompts that support the evaluation
1302
+ and comparison of text-to-image models.
1303
+ metric_groups:
1304
+ - heim_alignment_human_metrics
1305
+ - heim_alignment_clip_metrics
1306
+ - heim_aesthetics_automated_metrics
1307
+ - heim_originality_watermark_metrics
1308
+ - heim_toxicity_metrics
1309
+ - heim_toxicity_black_metrics
1310
+ - heim_toxicity_nudity_metrics
1311
+ - general_information
1312
+ environment:
1313
+ main_split: test
1314
+ taxonomy:
1315
+ task: Knowledge
1316
+
1317
+ - name: time_most_significant_historical_figures
1318
+ display_name: TIME's most significant historical figures
1319
+ description: People from TIME's "The 100 Most Significant Figures in History" list
1320
+ metric_groups:
1321
+ - heim_alignment_human_metrics
1322
+ - heim_alignment_clip_metrics
1323
+ - heim_aesthetics_automated_metrics
1324
+ - heim_originality_watermark_metrics
1325
+ - heim_toxicity_metrics
1326
+ - heim_toxicity_black_metrics
1327
+ - heim_toxicity_nudity_metrics
1328
+ - general_information
1329
+ environment:
1330
+ main_split: test
1331
+ taxonomy:
1332
+ task: Knowledge
1333
+
1334
+ - name: demographic_stereotypes
1335
+ display_name: Demographic Stereotypes
1336
+ description: Simple user prompts ("A photo of the face of...") generate images perpetuating
1337
+ dangerous racial, ethnic, gendered, class, and intersectional stereotypes.
1338
+ metric_groups:
1339
+ - heim_alignment_clip_metrics
1340
+ - heim_aesthetics_automated_metrics
1341
+ - heim_originality_watermark_metrics
1342
+ - heim_bias_metrics
1343
+ - heim_gender_metrics
1344
+ - heim_skin_tone_metrics
1345
+ - heim_toxicity_metrics
1346
+ - heim_toxicity_black_metrics
1347
+ - heim_toxicity_nudity_metrics
1348
+ - general_information
1349
+ environment:
1350
+ main_split: test
1351
+ taxonomy:
1352
+ task: Unbiasedness
1353
+
1354
+ - name: mental_disorders
1355
+ display_name: Mental Disorders
1356
+ description: Prompt text-to-image models with mental disorders listed by the World
1357
+ Health Organization
1358
+ metric_groups:
1359
+ - heim_alignment_clip_metrics
1360
+ - heim_aesthetics_automated_metrics
1361
+ - heim_originality_watermark_metrics
1362
+ - heim_bias_metrics
1363
+ - heim_gender_metrics
1364
+ - heim_skin_tone_metrics
1365
+ - heim_toxicity_metrics
1366
+ - heim_toxicity_black_metrics
1367
+ - heim_toxicity_nudity_metrics
1368
+ - general_information
1369
+ environment:
1370
+ main_split: test
1371
+ taxonomy:
1372
+ task: Unbiasedness
1373
+
1374
+ - name: i2p
1375
+ display_name: Inappropriate Image Prompts (I2P)
1376
+ description: Contains real user prompts for generative text-to-image models that
1377
+ are likely to produce inappropriate images ([paper](https://arxiv.org/abs/2211.05105)).
1378
+ metric_groups:
1379
+ - heim_alignment_clip_metrics
1380
+ - heim_aesthetics_automated_metrics
1381
+ - heim_originality_watermark_metrics
1382
+ - heim_toxicity_metrics
1383
+ - heim_toxicity_black_metrics
1384
+ - heim_toxicity_nudity_metrics
1385
+ - general_information
1386
+ environment:
1387
+ main_split: test
1388
+ taxonomy:
1389
+ task: Toxicity mitigation