crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
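The list above reflects a package-level reorganization: modules under `helm/proxy/clients` and `helm/proxy/tokenizers` moved to the top-level `helm/clients` and `helm/tokenizers` packages, and the monolithic `helm/benchmark/run_specs.py` was split into a `helm/benchmark/run_specs/` package. A minimal sketch of the import change downstream code would need, assuming the class names carried over unchanged from the moved modules:

```python
# Before (crfm-helm 0.4.0): clients and tokenizers lived under helm.proxy.
# from helm.proxy.clients.huggingface_client import HuggingFaceClient
# from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# After (crfm-helm 0.5.0), per the renames in the file list above:
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```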
helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py ADDED
@@ -0,0 +1,169 @@
+ import os.path
+ from typing import Dict, List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+
+
+ class MultipanelVQAScenario(Scenario):
+     """
+     Muffin or Chihuahua? Challenging Large Vision-Language Models with Multipanel VQA
+
+     We introduce Multipanel Visual Question Answering (MultipanelVQA), a novel benchmark
+     comprising 6,600 triplets of questions, answers, and multipanel images that specifically
+     challenge models in comprehending multipanel images. Our evaluation shows that questions in
+     the MultipanelVQA benchmark pose significant challenges to the state-of-the-art Large Vision
+     Language Models (LVLMs) tested, even though humans can attain approximately 99% accuracy on
+     these questions. There are two types of questions in two different situations in the
+     MultipanelVQA benchmark: multiple-choice or open-ended generation paired with real-world or
+     synthetic images. We use the multiple-choice metrics and the exact match metric for the two
+     question-answering types, respectively.
+
+     @article{fan2024muffin,
+         title={Muffin or Chihuahua? Challenging Large Vision-Language Models with Multipanel VQA},
+         author={Fan, Yue and Gu, Jing and Zhou, Kaiwen and Yan, Qianqi and Jiang, Shan and
+                 Kuo, Ching-Chen and Guan, Xinze and Wang, Xin Eric},
+         journal={arXiv preprint arXiv:2401.15847},
+         year={2024}
+     }
+
+     Paper: https://arxiv.org/abs/2401.15847
+     """
+
+     MULTIPANELVQA_HUGGINGFACE_DATASET_NAME: Dict[str, str] = {
+         "synthetic": "yfan1997/MultipanelVQA_synthetic",
+         "real-world": "yfan1997/MultipanelVQA_real-world",
+     }
+
+     SUBJECTS: List[str] = ["synthetic", "real-world"]
+
+     name = "multipanelvqa"
+     description = "Evaluate multimodal models on multipanel VQA ([paper](https://arxiv.org/abs/2401.15847))."
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str, question_type: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+
+         assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
+         self._question_type: str = question_type
+
+     def convert_text_answer_to_option(self, text_answer: str, question: str):
+         option_answer: str
+         # Some answers may include a ')'
+         if len(text_answer) <= 3:
+             option_answer = text_answer[0]
+         else:
+             # There are examples where the answer is the text answer
+             # instead of an option
+             for line in question.split("\n"):
+                 if text_answer in line:
+                     option_answer = line[0]
+                     break
+         return option_answer.upper()
+
+     def split_options_and_question(self, original_question: str):
+         question_and_options: List[str] = [item.strip().lower() for item in original_question.split("\n")]
+         last_append_phrase: str = "(please select one)"
+         question: str = question_and_options[0]
+         options: List[str] = []
+         if len(question_and_options) >= 6:
+             for item in question_and_options[1:]:
+                 if last_append_phrase in item:
+                     break
+                 options.append(item[3:])
+         elif len(question_and_options) == 5:
+             for item in question_and_options[1:]:
+                 if last_append_phrase in item:
+                     item = item[: -len(last_append_phrase)]
+                 options.append(item[3:])
+         return question, options
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the MultipanelVQA benchmark
+         instances: List[Instance] = []
+         # Process the test set
+         # Two open-ended generation instances and
+         # one multiple-choice generation instance per row
+         for image_index, row in enumerate(
+             tqdm(
+                 load_dataset(
+                     self.MULTIPANELVQA_HUGGINGFACE_DATASET_NAME[self._subject],
+                     split=TEST_SPLIT,
+                     cache_dir=output_path,
+                 )
+             )
+         ):
+             # Download the image
+             # Save the image locally
+             image_path: str = os.path.join(images_path, f"{image_index}.png")
+             if not os.path.exists(image_path):
+                 row["image"].save(image_path)
+
+             # Add the references
+             references: List[Reference] = []
+             question: str
+             answer: str
+             content: List[MediaObject]
+             if self._question_type == "open":
+                 question_1: str = row["question_1"]
+                 question_2: str = row["question_2"]
+                 answer_1: str = row["answer_1"]
+                 answer_2: str = row["answer_2"]
+                 for answer, question in zip([answer_1, answer_2], [question_1, question_2]):
+                     content = [
+                         MediaObject(location=image_path, content_type="image/png"),
+                         MediaObject(text=question, content_type="text/plain"),
+                     ]
+                     instances.append(
+                         Instance(
+                             Input(multimedia_content=MultimediaObject(content)),
+                             references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                             split=TEST_SPLIT,
+                         )
+                     )
+             else:
+                 options: List[str]
+                 original_question: str = row["question_3"]
+                 question, options = self.split_options_and_question(original_question)
+                 answer = row["answer_3"].strip()
+                 answer = self.convert_text_answer_to_option(answer, original_question)
+                 # The given correct answer is a letter, but we need an index
+                 correct_answer_index: int = ord(answer) - ord("A")
+                 # The options are originally appended to the question
+
+                 for i, option in enumerate(options):
+                     reference: Reference
+                     is_correct: bool = i == correct_answer_index
+                     reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                     references.append(reference)
+
+                 content = [
+                     MediaObject(location=image_path, content_type="image/png"),
+                     MediaObject(text=question, content_type="text/plain"),
+                 ]
+                 instances.append(
+                     Instance(
+                         Input(multimedia_content=MultimediaObject(content)),
+                         references=references,
+                         split=TEST_SPLIT,
+                     )
+                 )
+
+         return instances
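A quick sketch (not part of the diff) of the option-parsing helpers defined above; the question string is hypothetical but follows the "(please select one)" format that `split_options_and_question` expects:

```python
from helm.benchmark.scenarios.vision_language.multipanelvqa_scenario import MultipanelVQAScenario

scenario = MultipanelVQAScenario(subject="synthetic", question_type="multiple-choice")
original_question = (
    "Which panel shows a cat?\n"
    "(A) top left\n"
    "(B) top right\n"
    "(C) bottom left\n"
    "(D) bottom right\n"
    "(please select one)"
)
# Lowercases the text and strips the "(a) "-style prefixes from each option.
question, options = scenario.split_options_and_question(original_question)
print(question)  # "which panel shows a cat?"
print(options)
# Short answers like "B" are taken as the option letter directly.
print(scenario.convert_text_answer_to_option("B", original_question))  # "B"
```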
helm/benchmark/scenarios/vision_language/pope_scenario.py ADDED
@@ -0,0 +1,104 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from datasets import load_dataset
+ from tqdm import tqdm
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+
+
+ class POPEScenario(Scenario):
+     """
+     POPE dataset
+     Despite the promising progress on Large Vision-Language Models (LVLMs), we find that LVLMs suffer from
+     the hallucination problem, i.e. they tend to generate objects that are inconsistent with the target
+     images in the descriptions. To investigate it, this work presents the first systematic study on object
+     hallucination of LVLMs based on the VQAv2 benchmark. We find that objects that frequently occur in the
+     visual instructions or co-occur with the image objects are especially prone to be hallucinated by LVLMs.
+     In POPE, images from VQAv2 are matched with questions asking about the presence of certain objects in the
+     image. We use the exact match metric for model evaluation on POPE.
+
+     @inproceedings{li2023evaluating,
+         title={Evaluating Object Hallucination in Large Vision-Language Models},
+         author={Li, Yifan and Du, Yifan and Zhou, Kun and Wang, Jinpeng and Zhao, Wayne Xin and Wen, Ji-Rong},
+         booktitle={Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
+         pages={292--305},
+         year={2023}
+     }
+
+     Paper: https://aclanthology.org/2023.emnlp-main.20/
+     """
+
+     POPE_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/POPE"
+
+     name = "pope"
+     description = (
+         "Open-ended questions about object hallucination in images ([paper](https://aclanthology.org/2023.emnlp-main.20/))."
+     )
+     tags = ["vision-language", "visual question answering"]
+     options: List[str] = ["Yes", "No"]
+
+     def get_label_from_answer(self, answer: str):
+         label: str
+         if answer == "yes":
+             label = "A"
+         elif answer == "no":
+             label = "B"
+         else:
+             raise NotImplementedError(f"Invalid answer: {answer}")
+         return label
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+         instances: List[Instance] = []
+         for row in tqdm(
+             load_dataset(
+                 self.POPE_HUGGINGFACE_DATASET_NAME,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             image_source: str = row["image_source"]
+             # Save the image locally
+             image_path: str = os.path.join(images_path, f"{image_source}.jpg")
+             if not os.path.exists(image_path):
+                 row["image"].save(image_path)
+
+             question: str = row["question"]
+             answer: str = row["answer"]
+             references: List[Reference] = []
+
+             answer = self.get_label_from_answer(answer)
+             # The given correct answer is a letter, but we need an index
+             correct_answer_index: int = ord(answer) - ord("A")
+             # The options are the fixed choices "Yes" and "No"
+
+             for i, option in enumerate(self.options):
+                 reference: Reference
+                 is_correct: bool = i == correct_answer_index
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                 references.append(reference)
+
+             content = [
+                 MediaObject(location=image_path, content_type="image/jpeg"),
+                 MediaObject(text=question, content_type="text/plain"),
+             ]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=references,
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
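A hedged sketch of how a scenario like this is exercised; the output directory is a placeholder, and HELM's runner normally makes this call rather than user code:

```python
from helm.benchmark.scenarios.vision_language.pope_scenario import POPEScenario

# Downloads lmms-lab/POPE on first use and caches images under the output path.
scenario = POPEScenario()
instances = scenario.get_instances(output_path="./pope_scratch")  # hypothetical path

# Each instance pairs an image and a question with two references,
# "Yes" and "No", exactly one of which is tagged CORRECT_TAG.
first = instances[0]
print([(ref.output.text, ref.tags) for ref in first.references])
```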
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py ADDED
@@ -0,0 +1,129 @@
+ import os.path
+ from typing import Dict, List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+
+
+ class SEEDBenchScenario(Scenario):
+     """
+     SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension
+
+     Based on powerful Large Language Models (LLMs), recent generative Multimodal
+     Large Language Models (MLLMs) have gained prominence as a pivotal research area.
+     In SEED-Bench, we address the evaluation of generative comprehension in MLLMs
+     as a preliminary step towards a comprehensive assessment of generative models.
+     SEED-Bench consists of 19K multiple-choice questions with accurate human annotations
+     (6x larger than existing benchmarks), which span 12 evaluation dimensions
+     including the comprehension of both the image and video modality. We select 9
+     evaluation aspects that take images as input. In the benchmark,
+     multiple-choice questions with ground-truth options derived from human
+     annotation enable an objective and efficient assessment of model performance,
+     eliminating the need for human or GPT intervention during evaluation. We employ
+     the multiple-choice metric for evaluating the performance of models.
+
+     @article{li2023seed,
+         title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+         author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+         journal={arXiv preprint arXiv:2307.16125},
+         year={2023}
+     }
+
+     Paper: https://arxiv.org/abs/2307.16125
+     """
+
+     SEED_BENCH_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/SEED-Bench"
+
+     SUBJECTS: Dict[str, int] = {
+         "scene-understanding": 1,
+         "instance-identity": 2,
+         "instance-attributes": 3,
+         "instance-location": 4,
+         "instances-counting": 5,
+         "spatial-relation": 6,
+         "instance-interaction": 7,
+         "visual-reasoning": 8,
+         "text-understanding": 9,
+     }
+
+     name = "seed_bench"
+     description = "Evaluate multimodal models on generative comprehension ([paper](https://arxiv.org/abs/2307.16125))."
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+
+     def get_subject_name(self, subject_name: str) -> str:
+         return "-".join(subject_name.lower().split())
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the SEED-Bench benchmark
+         instances: List[Instance] = []
+         # Process the test set:
+         # one multiple-choice instance per row,
+         # filtered to the selected question type
+         for row in tqdm(
+             load_dataset(
+                 self.SEED_BENCH_HUGGINGFACE_DATASET_NAME,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             question_type_key: str = self.get_subject_name(self._subject)
+             if row["question_type_id"] != self.SUBJECTS[question_type_key]:
+                 continue
+             question_id: str = row["question_id"]
+             # Download the image
+             # Save the image locally
+             image_path: str = os.path.join(images_path, f"{question_id}.png")
+             if not os.path.exists(image_path):
+                 # Some images are in CMYK mode; convert to RGB.
+                 row["image"][0].convert("RGB").save(image_path, "PNG", optimize=True)
+
+             # Add the references
+             references: List[Reference] = []
+             question: str = row["question"]
+             answer: str
+             content: List[MediaObject]
+             options: List[str] = [row["choice_a"], row["choice_b"], row["choice_c"], row["choice_d"]]
+             answer = row["answer"].strip()
+             # The given correct answer is a letter, but we need an index
+             correct_answer_index: int = ord(answer) - ord("A")
+             # The options come from the four choice columns
+
+             for i, option in enumerate(options):
+                 reference: Reference
+                 is_correct: bool = i == correct_answer_index
+                 reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                 references.append(reference)
+
+             content = [
+                 MediaObject(location=image_path, content_type="image/png"),
+                 MediaObject(text=question, content_type="text/plain"),
+             ]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=references,
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
helm/benchmark/scenarios/vision_language/unicorn_scenario.py ADDED
@@ -0,0 +1,108 @@
+ import os.path
+ from typing import Dict, List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+ class UnicornScenario(Scenario):
+     """
+     How Many Unicorns are in this Image? A Safety Evaluation Benchmark of Vision LLMs
+
+     We shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation
+     suite Unicorn, covering both out-of-distribution (OOD) generalization and adversarial robustness. For the OOD
+     evaluation, we present two novel VQA datasets --- OODCV-VQA and Sketchy-VQA, each with one variant, designed
+     to test model performance under challenging conditions. In the OOD scenario, questions are matched with
+     boolean or numerical answers, and we use exact match metrics for evaluation. When comparing OOD Sketchy-VQA
+     with its synthesized in-distribution counterpart, we found an average model output F1 drop of 8.9%,
+     highlighting the challenging nature of the OOD scenario in the Unicorn benchmark.
+
+     @article{tu2023unicorns,
+         title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
+         author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
+                 Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
+         journal={arXiv preprint arXiv:2311.16101},
+         year={2023}
+     }
+
+     Paper: https://arxiv.org/abs/2311.16101
+     """
+
+     UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"
+
+     IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"
+
+     SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]
+
+     IMG_TYPE: Dict[str, str] = {
+         "OODCV-VQA": "jpeg",
+         "OODCV-Counterfactual": "jpeg",
+         "Sketchy-VQA": "png",
+         "Sketchy-Challenging": "png",
+     }
+
+     name = "unicorn"
+     description = (
+         "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
+         " ([paper](https://arxiv.org/abs/2311.16101))."
+     )
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+         self._image_type: str = self.IMG_TYPE[self._subject]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the Unicorn benchmark
+         instances: List[Instance] = []
+         question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+
+         # Process the test set
+         for row in tqdm(
+             load_dataset(
+                 self.UNICORN_HUGGINGFACE_DATASET_NAME,
+                 data_files=question_data_files,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             # Download the image
+             image_path: str = row["image_path"]
+             local_image_path: str = os.path.join(output_path, image_path)
+             ensure_file_downloaded(
+                 source_url=self.IMAGE_URL.format(image_path=image_path),
+                 target_path=local_image_path,
+                 unpack=False,
+             )
+
+             content: List[MediaObject] = [
+                 MediaObject(location=local_image_path, content_type=f"image/{self._image_type}"),
+                 MediaObject(text=row["question"], content_type="text/plain"),
+             ]
+             answer: str = row["answer"]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py CHANGED
@@ -13,7 +13,7 @@ from helm.benchmark.scenarios.scenario import (
      Scenario,
  )
  from helm.common.media_object import MediaObject, MultimediaObject
- from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+ from helm.common.general import ensure_file_downloaded
 
 
  class VizWizScenario(Scenario):
@@ -60,7 +60,6 @@ class VizWizScenario(Scenario):
      def get_instances(self, output_path: str) -> List[Instance]:
          # Download the questions and annotations
          annotations_path: str = os.path.join(output_path, "annotations")
-         ensure_directory_exists(annotations_path)
          ensure_file_downloaded(
              source_url=self.ANNOTATIONS_URL,
              target_path=annotations_path,
helm/benchmark/scenarios/vision_language/vqa_scenario.py CHANGED
@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
          TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
      }
 
-     name = "visual_question_answering"
+     name = "vqa"
      description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
      tags = ["vision-language", "visual question answering"]
 
helm/benchmark/scenarios/wmt_14_scenario.py CHANGED
@@ -61,7 +61,7 @@ class WMT14Scenario(Scenario):
      def get_instances(self, output_path: str) -> List[Instance]:
          with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
              subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
-             hf_dataset: Any = load_dataset("wmt14", subset_name)
+             hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
          splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
 
          instances: List[Instance] = []
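The `wmt14` dataset on Hugging Face is backed by a loading script, and recent releases of the `datasets` library only execute such scripts when the caller opts in, which is what the `trust_remote_code=True` argument above does. A standalone sketch of the same call, using `fr-en` as an example subset:

```python
from datasets import load_dataset

# Without trust_remote_code=True, newer `datasets` versions refuse
# to run the wmt14 loading script.
hf_dataset = load_dataset("wmt14", "fr-en", trust_remote_code=True)
print(hf_dataset["validation"][0])  # {"translation": {"fr": ..., "en": ...}}
```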
helm/benchmark/server.py CHANGED
@@ -70,6 +70,14 @@ def serve_benchmark_output(filename):
      return response
 
 
+ @app.get("/cache/output/<filename:path>")
+ def serve_cache_output(filename):
+     response = static_file(filename, root=app.config["helm.cacheoutputpath"])
+     response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
+     response.set_header("Expires", "0")
+     return response
+
+
  @app.get("/")
  @app.get("/<filename:path>")
  def serve_static(filename="index.html"):
@@ -87,6 +95,12 @@ def main():
          help="The location of the output path (filesystem path or URL)",
          default="benchmark_output",
      )
+     parser.add_argument(
+         "--cache-output-path",
+         type=str,
+         help="The location of the filesystem cache output folder (filesystem path or URL)",
+         default="prod_env/cache/output",
+     )
      parser.add_argument(
          "--suite",
          type=str,
@@ -99,6 +113,11 @@ def main():
          default=None,
          help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
      )
+     parser.add_argument(
+         "--jquery",
+         action="store_true",
+         help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
+     )
      args = parser.parse_args()
 
      if args.suite and args.release:
@@ -107,7 +126,8 @@ def main():
      # Determine the location of the static directory.
      # This is a hack: it assumes that the static directory has a physical location,
      # which is not always the case (e.g. when using zipimport).
-     resource_path = resources.files("helm.benchmark.static").joinpath("index.html")
+     static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
+     resource_path = resources.files(static_package_name).joinpath("index.html")
      with resources.as_file(resource_path) as resource_filename:
          static_path = str(resource_filename.parent)
 
@@ -117,16 +137,19 @@ def main():
          # Output path is a URL, so set the output path base URL in the frontend to that URL
          # so that the frontend reads from that URL directly.
          app.config["helm.outputpath"] = None
+         # TODO: figure out helm.cacheoutputpath
          app.config["helm.outputurl"] = args.output_path
      else:
          # Output path is a location on disk, so set the output path base URL to /benchmark_output
          # and then serve files from the location on disk at that URL.
          app.config["helm.outputpath"] = path.abspath(args.output_path)
+         app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
          app.config["helm.outputurl"] = "benchmark_output"
 
      app.config["helm.suite"] = args.suite or "latest"
      app.config["helm.release"] = args.release
 
+     print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
      app.run(host="0.0.0.0", port=args.port)
 
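Taken together, the server changes expose files under the cache output directory at the new `/cache/output/` route with HTTP caching disabled. A hedged sketch of exercising it; the `helm-server` invocation, port, and filename below are assumptions, not part of the diff:

```python
import requests  # third-party HTTP client

# Assumes `helm-server --cache-output-path prod_env/cache/output` is running
# locally on port 8000; "example.png" is a hypothetical file in that folder.
response = requests.get("http://localhost:8000/cache/output/example.png")
response.raise_for_status()
print(response.headers["Cache-Control"])  # "no-cache, no-store, must-revalidate"
```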