crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,145 @@
1
+ import os.path
2
+ from typing import List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists
18
+
19
+
20
class MMEScenario(Scenario):
    """
    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models

    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
    multimodal tasks, showing amazing emergent abilities in recent studies. However,
    it is difficult for these case studies to fully reflect the performance of MLLM,
    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
    that may arise from direct use of public datasets for evaluation, the annotations
    of instruction-answer pairs are all manually designed. The concise instruction design
    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
    Besides, with such an instruction, we can also easily carry out quantitative
    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
    We use the multiple-choice metrics for 14 different evaluation tasks.

    @article{fu2023mme,
      title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
      author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
      Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
      Sun, Xing and Wu, Yunsheng and Ji, Rongrong},
      journal={arXiv preprint arXiv:2306.13394},
      year={2023}
    }

    Paper: https://arxiv.org/abs/2306.13394
    """

    MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"

    # Accepted values for the `subject` constructor argument; each one is matched
    # against the "category" column of a dataset row.
    SUBJECTS: List[str] = [
        "existence",
        "scene",
        "posters",
        "color",
        "OCR",
        "position",
        "celebrity",
        "artwork",
        "commonsense_reasoning",
        "numerical_calculation",
        "landmark",
        "count",
        "text_translation",
        "code_reasoning",
    ]

    name = "mme"
    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
    tags = ["vision-language"]
    # Answer choices offered for every question, in the order "A", "B".
    options: List[str] = ["Yes", "No"]

    def __init__(self, subject: str):
        """Create the scenario for one MME subtask; `subject` must be one of `SUBJECTS`."""
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

    def get_label_from_answer(self, answer: str):
        """Return the option letter for a dataset answer: "Yes" -> "A", "No" -> "B".

        Raises NotImplementedError for any other value (the comparison is case-sensitive).
        """
        if answer == "Yes":
            return "A"
        if answer == "No":
            return "B"
        raise NotImplementedError(f"Invalid answer: {answer}")

    def remove_question_suffix_for_mcqa(self, question: str):
        """Drop the "Please answer yes or no." instruction and surrounding whitespace."""
        without_suffix: str = question.replace("Please answer yes or no.", "")
        return without_suffix.strip()

    def get_question_id(self, question_id: str):
        """Turn a raw question id into a flat file stem.

        Everything from the first "." onwards is discarded and every "/" becomes "-".
        """
        stem: str = question_id.split(".")[0]
        return stem.replace("/", "-")

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one yes/no multiple-choice instance per test row of the selected subject.

        Images are written to `<output_path>/images/<question id>.png` unless a file
        with that name already exists.
        """
        images_path: str = os.path.join(output_path, "images")
        ensure_directory_exists(images_path)

        # Only the test split is loaded; it contains the rows of every category,
        # so rows of other subjects are filtered out below.
        dataset = load_dataset(
            self.MME_HUGGINGFACE_DATASET_NAME,
            split=TEST_SPLIT,
            cache_dir=output_path,
        )

        instances: List[Instance] = []
        for row in tqdm(dataset):
            if row["category"] != self._subject:
                continue

            # Save the image locally (skipped when it is already on disk)
            file_stem: str = self.get_question_id(row["question_id"])
            image_path: str = os.path.join(images_path, f"{file_stem}.png")
            if not os.path.exists(image_path):
                row["image"].save(image_path)

            question: str = self.remove_question_suffix_for_mcqa(row["question"])

            # The dataset answer is "Yes"/"No"; convert it to a letter and then to
            # the index of the matching entry in `self.options`.
            label: str = self.get_label_from_answer(row["answer"])
            correct_answer_index: int = ord(label) - ord("A")

            references: List[Reference] = [
                Reference(
                    Output(text=option),
                    tags=[CORRECT_TAG] if position == correct_answer_index else [],
                )
                for position, option in enumerate(self.options)
            ]

            content = [
                MediaObject(location=image_path, content_type="image/png"),
                MediaObject(text=question, content_type="text/plain"),
            ]
            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances
@@ -0,0 +1,187 @@
1
+ import os.path
2
+ from typing import Dict, List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ VALID_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists
18
+
19
+
20
class MMMUScenario(Scenario):
    """
    MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI

    We introduce MMMU: a new benchmark designed to evaluate multimodal models on massive multi-discipline
    tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously
    collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines:
    Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.
    These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such
    as charts, diagrams, maps, tables, music sheets, and chemical structures.

    @article{yue2023mmmu,
        title={MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI},
        author={Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel
        Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and
        Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun
        and Yu Su and Wenhu Chen},
        journal={arXiv preprint arXiv:2311.16502},
        year={2023},
    }

    Paper: https://arxiv.org/abs/2311.16502
    Website: https://mmmu-benchmark.github.io/
    """

    MMMU_HUGGINGFACE_DATASET_NAME: str = "MMMU/MMMU"
    # Highest image column index read from a row; columns are named image_1, image_2, ...
    MAX_NUM_IMAGES: int = 7

    # Accepted values for the `subject` constructor argument; the subject is passed to
    # `load_dataset` as the dataset configuration name.
    SUBJECTS: List[str] = [
        "Accounting",
        "Agriculture",
        "Architecture_and_Engineering",
        "Art",
        "Art_Theory",
        "Basic_Medical_Science",
        "Biology",
        "Chemistry",
        "Clinical_Medicine",
        "Computer_Science",
        "Design",
        "Diagnostics_and_Laboratory_Medicine",
        "Economics",
        "Electronics",
        "Energy_and_Power",
        "Finance",
        "Geography",
        "History",
        "Literature",
        "Manage",
        "Marketing",
        "Materials",
        "Math",
        "Mechanical_Engineering",
        "Music",
        "Pharmacy",
        "Physics",
        "Psychology",
        "Public_Health",
        "Sociology",
    ]

    name = "mmmu"
    description = (
        "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str, question_type: str):
        """Create the scenario for one subject and one question type.

        `subject` must be one of `SUBJECTS`; `question_type` must be
        "multiple-choice" or "open".
        """
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

        assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
        self._question_type: str = question_type

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build the validation instances for the configured subject and question type.

        Images are written to `<output_path>/images/<subject>/<question id>_image_<n>.png`
        unless a file with that name already exists. The question text may contain
        `<image n>` placeholders; each placeholder that has a matching image is replaced
        by that image, so the resulting input interleaves text and images.
        """
        # Function-scope import so this block does not depend on a change to the
        # module-level import list.
        import ast

        images_path: str = os.path.join(output_path, "images", self._subject)
        ensure_directory_exists(images_path)

        instances: List[Instance] = []

        # Process the validation set
        # There seems to be a dev set, but it's unavailable through load_dataset.
        # When loading "dev", we get error: ValueError: Unknown split "dev". Should be one of ['test', 'validation'].
        # The test set doesn't have answers, since the MMMU competition/leaderboard uses the test set
        for row in tqdm(
            load_dataset(self.MMMU_HUGGINGFACE_DATASET_NAME, self._subject, split="validation", cache_dir=output_path)
        ):
            # Skip questions whose type (multiple-choice vs. open) is not the one requested
            if row["question_type"] != self._question_type:
                continue

            # Skip rows whose id does not mention the requested subject
            question_id: str = row["id"]
            if self._subject not in question_id:
                continue

            question_template: str = row["question"]
            # `options` is stored as the text of a Python list literal. Parse it with
            # literal_eval rather than eval so that text coming from a downloaded dataset
            # can never be executed as code.
            options: List[str] = ast.literal_eval(row["options"])
            answer: str = row["answer"]

            # Create the question. Questions can have text and images interleaved
            question_template_to_image_path: Dict[str, str] = {}
            content: List[MediaObject] = []
            # The range is inclusive of MAX_NUM_IMAGES so that the last image column is read too
            for img_number in range(1, self.MAX_NUM_IMAGES + 1):
                image_id: str = f"image_{img_number}"
                if row.get(image_id) is None:
                    # At this point, there are no more images for this question
                    break

                # Save the image locally
                image_path: str = os.path.join(images_path, f"{question_id}_{image_id}.png")
                if not os.path.exists(image_path):
                    row[image_id].save(image_path)

                # Remember the image for every placeholder: answer options may be
                # placeholders too (see the references below).
                image_template_tag: str = f"<image {img_number}>"
                question_template_to_image_path[image_template_tag] = image_path

                # There are cases when the image is included, but it is not used either in the
                # question template or in the answer options
                if image_template_tag not in question_template:
                    # The image is not in the question template
                    continue

                # Emit the text before the placeholder, then the image; whatever follows the
                # placeholder becomes the template for the next iteration.
                head, question_template = question_template.split(image_template_tag, 1)
                if head:
                    content.append(MediaObject(text=head, content_type="text/plain"))
                content.append(MediaObject(location=image_path, content_type="image/png"))

            # Add the rest of the question template
            if question_template:
                content.append(MediaObject(text=question_template, content_type="text/plain"))

            # Add the references
            references: List[Reference] = []
            if self._question_type == "multiple-choice":
                # The given correct answer is a letter, but we need an index
                correct_answer_index: int = ord(answer) - ord("A")

                for i, option in enumerate(options):
                    reference: Reference
                    is_correct: bool = i == correct_answer_index

                    if option in question_template_to_image_path:
                        # Image answer choices
                        image_path = question_template_to_image_path[option]
                        image_choice = MediaObject(location=image_path, content_type="image/png")
                        reference = Reference(
                            Output(multimedia_content=MultimediaObject([image_choice])),
                            tags=[CORRECT_TAG] if is_correct else [],
                        )
                    else:
                        # Text answer choices
                        reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])

                    references.append(reference)
            else:
                # Open questions have a single reference: the answer text itself
                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))

            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=references,
                    split=VALID_SPLIT,
                )
            )

        return instances
@@ -0,0 +1,169 @@
1
+ import os.path
2
+ from typing import Dict, List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists
18
+
19
+
20
class MultipanelVQAScenario(Scenario):
    """
    Muffin or Chihuahua? Challenging Large Vision-Language Models with Multipanel VQA

    We introduce Multipanel Visual Question Answering (MultipanelVQA), a novel benchmark
    comprising 6,600 triplets of questions, answers, and multipanel images that specifically
    challenge models in comprehending multipanel images. Our evaluation shows that questions in
    the MultipanelVQA benchmark pose significant challenges to the state-of-the-art Large Vision
    Language Models (LVLMs) tested, even though humans can attain approximately 99% accuracy on
    these questions. There are two types of questions in two different situations in the
    MultipanelVQA benchmark: multiple-choice or open-ended generation paired with real-world or
    synthetic images. We use the multiple-choice metrics and the exact match metric for two
    different question-answering types, respectively.

    @article{fan2024muffin,
      title={Muffin or Chihuahua? Challenging Large Vision-Language Models with Multipanel VQA},
      author={Fan, Yue and Gu, Jing and Zhou, Kaiwen and Yan, Qianqi and Jiang, Shan and
      Kuo, Ching-Chen and Guan, Xinze and Wang, Xin Eric},
      journal={arXiv preprint arXiv:2401.15847},
      year={2024}
    }

    Paper: https://arxiv.org/abs/2401.15847
    """

    # Hugging Face dataset to load for each subject
    MULTIPANELVQA_HUGGINGFACE_DATASET_NAME: Dict[str, str] = {
        "synthetic": "yfan1997/MultipanelVQA_synthetic",
        "real-world": "yfan1997/MultipanelVQA_real-world",
    }

    SUBJECTS: List[str] = ["synthetic", "real-world"]

    name = "multipanelvqa"
    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2401.15847))."
    tags = ["vision-language"]

    def __init__(self, subject: str, question_type: str):
        """Create the scenario for one subject and one question type.

        `subject` must be one of `SUBJECTS`; `question_type` must be
        "multiple-choice" or "open".
        """
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

        assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
        self._question_type: str = question_type

    def convert_text_answer_to_option(self, text_answer: str, question: str):
        """Return the upper-cased option letter that `text_answer` refers to.

        An answer of at most three characters is treated as the letter itself, possibly
        followed by punctuation such as ")", and its first character is returned.
        A longer answer is treated as the text of an option: the first line of `question`
        that contains it supplies the letter (that line's first character).

        Raises:
            ValueError: if `text_answer` is empty, or if it is longer than three
                characters and no line of `question` contains it.
        """
        if not text_answer:
            raise ValueError("Cannot convert an empty answer to an option")

        # Some answer may have a ')' with it
        if len(text_answer) <= 3:
            return text_answer[0].upper()

        # There are examples where the answer is the text answer
        # instead of an option
        for line in question.split("\n"):
            if text_answer in line:
                return line[0].upper()

        # Previously this fell through with the result variable unassigned
        raise ValueError(f"Answer {text_answer!r} does not match any line of the question: {question!r}")

    def split_options_and_question(self, original_question: str):
        """Split a multiple-choice prompt into `(question, options)`.

        The prompt is split on newlines and every line is stripped and lower-cased.
        The first line is the question. The first three characters of each option line
        (presumably a prefix such as "a) " -- not verified here) are dropped.

        * Six or more lines: option lines are read until the line that contains
          "(please select one)", which is not itself an option.
        * Exactly five lines: all four remaining lines are options; on a line containing
          the phrase, as many characters as the phrase is long are cut from the end of
          the line (this assumes the phrase is the line's suffix).
        * Fewer than five lines: no options are returned.
        """
        question_and_options: List[str] = [item.strip().lower() for item in original_question.split("\n")]
        last_append_phrase: str = "(please select one)"
        question: str = question_and_options[0]
        options: List[str] = []
        if len(question_and_options) >= 6:
            for item in question_and_options[1:]:
                if last_append_phrase in item:
                    break
                options.append(item[3:])
        elif len(question_and_options) == 5:
            for item in question_and_options[1:]:
                if last_append_phrase in item:
                    item = item[: -len(last_append_phrase)]
                options.append(item[3:])
        return question, options

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build the test instances for the configured subject and question type.

        For "open", each row yields two instances (question_1/answer_1 and
        question_2/answer_2), each with the answer as its single correct reference.
        Otherwise each row yields one multiple-choice instance built from
        question_3/answer_3.

        Images are written to `<output_path>/images/<subject>/<row index>.png` unless a
        file with that name already exists.
        """
        # File names are row indices, and each subject is a separate dataset whose rows
        # are numbered from 0. Keeping one directory per subject prevents a subject from
        # reusing the other subject's cached image when both share an output path.
        images_path: str = os.path.join(output_path, "images", self._subject)
        ensure_directory_exists(images_path)

        instances: List[Instance] = []
        # Only the test split is loaded
        for image_index, row in enumerate(
            tqdm(
                load_dataset(
                    self.MULTIPANELVQA_HUGGINGFACE_DATASET_NAME[self._subject],
                    split=TEST_SPLIT,
                    cache_dir=output_path,
                )
            )
        ):
            # Save the image locally
            image_path: str = os.path.join(images_path, f"{image_index}.png")
            if not os.path.exists(image_path):
                row["image"].save(image_path)

            question: str
            answer: str
            content: List[MediaObject]
            if self._question_type == "open":
                # Two open-ended question/answer pairs per row
                question_1: str = row["question_1"]
                question_2: str = row["question_2"]
                answer_1: str = row["answer_1"]
                answer_2: str = row["answer_2"]
                for answer, question in zip([answer_1, answer_2], [question_1, question_2]):
                    content = [
                        MediaObject(location=image_path, content_type="image/png"),
                        MediaObject(text=question, content_type="text/plain"),
                    ]
                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
                            split=TEST_SPLIT,
                        )
                    )
            else:
                # One multiple-choice question per row. The options are appended to the
                # question text in the dataset, so separate them first.
                options: List[str]
                original_question: str = row["question_3"]
                question, options = self.split_options_and_question(original_question)
                answer = row["answer_3"].strip()
                answer = self.convert_text_answer_to_option(answer, original_question)
                # The given correct answer is a letter, but we need an index
                correct_answer_index: int = ord(answer) - ord("A")

                references: List[Reference] = []
                for i, option in enumerate(options):
                    reference: Reference
                    is_correct: bool = i == correct_answer_index
                    reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
                    references.append(reference)

                content = [
                    MediaObject(location=image_path, content_type="image/png"),
                    MediaObject(text=question, content_type="text/plain"),
                ]
                instances.append(
                    Instance(
                        Input(multimedia_content=MultimediaObject(content)),
                        references=references,
                        split=TEST_SPLIT,
                    )
                )

        return instances
@@ -0,0 +1,104 @@
1
+ from typing import List
2
+ import os
3
+
4
+ from helm.benchmark.scenarios.scenario import (
5
+ CORRECT_TAG,
6
+ TEST_SPLIT,
7
+ Instance,
8
+ Input,
9
+ Output,
10
+ Reference,
11
+ Scenario,
12
+ )
13
+ from datasets import load_dataset
14
+ from tqdm import tqdm
15
+ from helm.common.media_object import MediaObject, MultimediaObject
16
+ from helm.common.general import ensure_directory_exists
17
+
18
+
19
class POPEScenario(Scenario):
    """
    POPE dataset
    Despite the promising progress on Large Vision-Language Models (LVLMs), we find that LVLMs suffer from
    the hallucination problem, i.e. they tend to generate objects that are inconsistent with the target
    images in the descriptions. To investigate it, this work presents the first systematic study on object
    hallucination of LVLMs based on VQAv2 benchmark. We find that: objects that frequently occur in the
    visual instructions or co-occur with the image objects, are obviously prone to be hallucinated by LVLMs.
    In POPE, images from VQAv2 are matched with questions asking the appearance of certain objects in the
    image. We use the exact match metric for model evaluation on POPE.

    @inproceedings{li2023evaluating,
        title={Evaluating Object Hallucination in Large Vision-Language Models},
        author={Li, Yifan and Du, Yifan and Zhou, Kun and Wang, Jinpeng and Zhao, Wayne Xin and Wen, Ji-Rong},
        booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
        pages={292--305},
        year={2023}
    }

    Paper: https://aclanthology.org/2023.emnlp-main.20/
    """

    POPE_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/POPE"

    name = "pope"
    description = (
        "Open-ended questions about hallucination images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
    )
    tags = ["vision-language", "visual question answering"]
    # Answer choices offered for every question, in the order "A", "B".
    options: List[str] = ["Yes", "No"]

    def get_label_from_answer(self, answer: str):
        """Return the option letter for a dataset answer: "yes" -> "A", "no" -> "B".

        Raises NotImplementedError for any other value (the comparison is case-sensitive).
        """
        if answer == "yes":
            return "A"
        if answer == "no":
            return "B"
        raise NotImplementedError(f"Invalid answer: {answer}")

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one yes/no multiple-choice instance per row of the test split.

        Images are written to `<output_path>/images/<image_source>.jpg` unless a file
        with that name already exists.
        """
        images_path: str = os.path.join(output_path, "images")
        ensure_directory_exists(images_path)

        # Only the test split is loaded
        dataset = load_dataset(
            self.POPE_HUGGINGFACE_DATASET_NAME,
            split=TEST_SPLIT,
            cache_dir=output_path,
        )

        instances: List[Instance] = []
        for row in tqdm(dataset):
            # Save the image locally (skipped when it is already on disk)
            file_stem: str = row["image_source"]
            image_path: str = os.path.join(images_path, f"{file_stem}.jpg")
            if not os.path.exists(image_path):
                row["image"].save(image_path)

            question: str = row["question"]

            # The dataset answer is "yes"/"no"; convert it to a letter and then to
            # the index of the matching entry in `self.options`.
            label: str = self.get_label_from_answer(row["answer"])
            correct_answer_index: int = ord(label) - ord("A")

            references: List[Reference] = [
                Reference(
                    Output(text=option),
                    tags=[CORRECT_TAG] if position == correct_answer_index else [],
                )
                for position, option in enumerate(self.options)
            ]

            content = [
                MediaObject(location=image_path, content_type="image/jpeg"),
                MediaObject(text=question, content_type="text/plain"),
            ]
            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances