crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,129 @@
1
+ import os.path
2
+ from typing import Dict, List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists
18
+
19
+
20
class SEEDBenchScenario(Scenario):
    """
    SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension

    Based on powerful Large Language Models (LLMs), recent generative Multimodal
    Large Language Models (MLLMs) have gained prominence as a pivotal research area.
    In Seed-Bench, we address the evaluation of generative comprehension in MLLMs
    as a preliminary step towards a comprehensive assessment of generative models.
    SEED-Bench consists of 19K multiple choice questions with accurate human annotations
    (x 6 larger than existing benchmarks), which spans 12 evaluation dimensions
    including the comprehension of both the image and video modality. We select 9
    evaluation aspects that take image as the input. In the benchmark,
    Multiple-choice questions with groundtruth options derived from human
    annotation enables an objective and efficient assessment of model performance,
    eliminating the need for human or GPT intervention during evaluation. We employ
    the multiple-choice metric for evaluating the performance of models.

    @article{li2023seed,
        title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
        author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
        journal={arXiv preprint arXiv:2307.16125},
        year={2023}
    }

    Paper: https://arxiv.org/abs/2307.16125
    """

    SEED_BENCH_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/SEED-Bench"

    # Maps each image-based evaluation dimension (subject) to its
    # `question_type_id` value in the HuggingFace dataset.
    SUBJECTS: Dict[str, int] = {
        "scene-understanding": 1,
        "instance-identity": 2,
        "instance-attributes": 3,
        "instance-location": 4,
        "instances-counting": 5,
        "spatial-relation": 6,
        "instance-interaction": 7,
        "visual-reasoning": 8,
        "text-understanding": 9,
    }

    name = "seed_bench"
    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2307.16125))."
    tags = ["vision-language"]

    def __init__(self, subject: str):
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

    def get_subject_name(self, subject_name: str) -> str:
        """Normalize a subject name to the lowercase, hyphen-separated key used in SUBJECTS."""
        return "-".join(subject_name.lower().split())

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one multiple-choice Instance per test-split row matching the configured subject.

        Images are saved locally under `<output_path>/images` as PNGs the first time
        they are encountered and reused on subsequent runs.
        """
        images_path: str = os.path.join(output_path, "images")
        ensure_directory_exists(images_path)

        # SEED-Bench only provides a test split.
        # Hoisted out of the loop: the subject key and its numeric id never change per row.
        question_type_id: int = self.SUBJECTS[self.get_subject_name(self._subject)]

        instances: List[Instance] = []
        for row in tqdm(
            load_dataset(
                self.SEED_BENCH_HUGGINGFACE_DATASET_NAME,
                split=TEST_SPLIT,
                cache_dir=output_path,
            )
        ):
            # Keep only the rows for the requested evaluation dimension.
            if row["question_type_id"] != question_type_id:
                continue

            # Save the image locally. Some images are CMYK mode, so convert to RGB first.
            question_id: str = row["question_id"]
            image_path: str = os.path.join(images_path, f"{question_id}.png")
            if not os.path.exists(image_path):
                row["image"][0].convert("RGB").save(image_path, "PNG", optimize=True)

            # Build the references: the gold answer is given as a letter (A-D),
            # which we convert to an index into the four options.
            options: List[str] = [row["choice_a"], row["choice_b"], row["choice_c"], row["choice_d"]]
            correct_answer_index: int = ord(row["answer"].strip()) - ord("A")
            references: List[Reference] = [
                Reference(Output(text=option), tags=[CORRECT_TAG] if i == correct_answer_index else [])
                for i, option in enumerate(options)
            ]

            content: List[MediaObject] = [
                MediaObject(location=image_path, content_type="image/png"),
                MediaObject(text=row["question"], content_type="text/plain"),
            ]
            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=references,
                    split=TEST_SPLIT,
                )
            )

        return instances
@@ -0,0 +1,108 @@
1
+ import os.path
2
+ from typing import Dict, List
3
+
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from helm.benchmark.scenarios.scenario import (
8
+ CORRECT_TAG,
9
+ TEST_SPLIT,
10
+ Instance,
11
+ Input,
12
+ Output,
13
+ Reference,
14
+ Scenario,
15
+ )
16
+ from helm.common.media_object import MediaObject, MultimediaObject
17
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
18
+
19
+
20
class UnicornScenario(Scenario):
    """
    How Many Unicorns are in this Image? A Safety Evaluation Benchmark of Vision LLMs

    We shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation
    suite Unicorn, covering both out-of-distribution (OOD) generalization and adversarial robustness. For the OOD
    evaluation, we present two novel VQA datasets --- OODCV-VQA and Sketchy-VQA, each with one variant, designed
    to test model performance under challenging conditions. In the OOD scenario, questions are matched with
    boolean or numerical answers, and we use exact match metrics for evaluation. When comparing OOD Sketchy-VQA
    with its synthesized in-distribution counterpart, we found an average model output F1 drop of 8.9%,
    highlighting the challenging nature of the OOD scenario in the Unicorn benchmark.

    @article{tu2023unicorns,
        title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
        author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
        Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
        journal={arXiv preprint arXiv:2311.16101},
        year={2023}
    }

    Paper: https://arxiv.org/abs/2311.16101
    """

    UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"

    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"

    SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]

    # Image file format per subject (OODCV images are JPEGs, Sketchy images are PNGs).
    IMG_TYPE: Dict[str, str] = {
        "OODCV-VQA": "jpeg",
        "OODCV-Counterfactual": "jpeg",
        "Sketchy-VQA": "png",
        "Sketchy-Challenging": "png",
    }

    name = "unicorn"
    description = (
        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
        " ([paper](https://arxiv.org/abs/2311.16101))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str):
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject
        self._image_type: str = self.IMG_TYPE[self._subject]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Yield one exact-match Instance per question of the configured subject."""
        # Local folder where downloaded images live.
        images_path: str = os.path.join(output_path, "images")
        ensure_directory_exists(images_path)

        # Unicorn ships a single (test) split; each subject has its own JSON question file.
        data_files = {TEST_SPLIT: f"{self._subject}.json"}
        dataset = load_dataset(
            self.UNICORN_HUGGINGFACE_DATASET_NAME,
            data_files=data_files,
            split=TEST_SPLIT,
            cache_dir=output_path,
        )

        instances: List[Instance] = []
        for example in tqdm(dataset):
            # Fetch the image for this example if it is not already on disk.
            relative_image_path: str = example["image_path"]
            downloaded_image_path: str = os.path.join(output_path, relative_image_path)
            ensure_file_downloaded(
                source_url=self.IMAGE_URL.format(image_path=relative_image_path),
                target_path=downloaded_image_path,
                unpack=False,
            )

            media: List[MediaObject] = [
                MediaObject(location=downloaded_image_path, content_type=f"image/{self._image_type}"),
                MediaObject(text=example["question"], content_type="text/plain"),
            ]
            gold_answer: str = example["answer"]
            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(media)),
                    references=[Reference(Output(text=gold_answer), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                )
            )

        return instances
@@ -0,0 +1,107 @@
1
+ from typing import Dict, List, Set
2
+ import json
3
+ import os
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ CORRECT_TAG,
7
+ TRAIN_SPLIT,
8
+ VALID_SPLIT,
9
+ Instance,
10
+ Input,
11
+ Output,
12
+ Reference,
13
+ Scenario,
14
+ )
15
+ from helm.common.media_object import MediaObject, MultimediaObject
16
+ from helm.common.general import ensure_file_downloaded
17
+
18
+
19
class VizWizScenario(Scenario):
    """
    VizWiz is a real-world visual question answering dataset consisting of questions
    asked by people who are blind. It originates from a natural visual question answering
    setting where blind people each took an image and recorded a spoken question about it,
    together with 10 crowdsourced answers per visual question.

    Version as of January 1, 2020:

    - 20,523 training image/question pairs
    - 205,230 training answer/answer confidence pairs
    - 4,319 validation image/question pairs
    - 43,190 validation answer/answer confidence pairs

    where answer confidences are one of {"yes", "maybe", "no"}.

    Answers are publicly shared for the train and validation splits and hidden for the test split.

    Paper: https://arxiv.org/abs/1802.08218
    Website: https://vizwiz.org/tasks-and-datasets/vqa
    """

    # Annotations are not available for the test set
    ANNOTATIONS_URL: str = "https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip"
    SPLIT_TO_ANNOTATIONS_FILE: Dict[str, str] = {
        TRAIN_SPLIT: "train.json",
        VALID_SPLIT: "val.json",
    }

    SPLIT_TO_IMAGES: Dict[str, str] = {
        TRAIN_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip",
        VALID_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip",
    }

    name = "viz_wiz"
    description = (
        "Real-world VQA dataset consisting of questions asked by "
        "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
    )
    tags = ["vision-language", "visual question answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build train and validation Instances, one per annotated image/question pair.

        Each Instance's references are the distinct crowdsourced answers given with
        "yes" confidence, in sorted order.
        """
        # Download the questions and annotations
        annotations_path: str = os.path.join(output_path, "annotations")
        ensure_file_downloaded(
            source_url=self.ANNOTATIONS_URL,
            target_path=annotations_path,
            unpack=True,
            unpack_type="unzip",
        )

        instances: List[Instance] = []
        for split in [TRAIN_SPLIT, VALID_SPLIT]:
            # Download the images for the split
            images_path: str = os.path.join(output_path, split)
            ensure_file_downloaded(
                source_url=self.SPLIT_TO_IMAGES[split],
                target_path=images_path,
                unpack=True,
                unpack_type="unzip",
            )

            annotations_split_path: str = os.path.join(annotations_path, self.SPLIT_TO_ANNOTATIONS_FILE[split])
            with open(annotations_split_path) as f:
                for image_annotation in json.load(f):
                    image_path: str = os.path.join(images_path, image_annotation["image"])
                    assert os.path.exists(image_path), f"Image {image_path} does not exist"

                    content: List[MediaObject] = [
                        MediaObject(location=image_path, content_type="image/jpeg"),
                        MediaObject(text=image_annotation["question"], content_type="text/plain"),
                    ]
                    # Keep only answers crowdworkers were confident about, deduplicated.
                    deduped_answers: Set[str] = {
                        answer_json["answer"]
                        for answer_json in image_annotation["answers"]
                        if answer_json["answer_confidence"] == "yes"
                    }

                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            # Sort the deduplicated answers so the reference order is
                            # deterministic across processes (raw set iteration order
                            # varies with Python's string hash randomization).
                            references=[
                                Reference(Output(text=answer), tags=[CORRECT_TAG])
                                for answer in sorted(deduped_answers)
                            ],
                            split=split,
                        )
                    )

        return instances
@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
54
54
  TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
55
55
  }
56
56
 
57
- name = "visual_question_answering"
57
+ name = "vqa"
58
58
  description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
59
59
  tags = ["vision-language", "visual question answering"]
60
60
 
@@ -1,6 +1,6 @@
1
1
  from typing import List, Any
2
2
  from datasets import load_dataset
3
- from helm.common.hierarchical_logger import hlog
3
+ from helm.common.hierarchical_logger import htrack_block
4
4
  from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
5
5
 
6
6
 
@@ -59,26 +59,26 @@ class WMT14Scenario(Scenario):
59
59
  return deduplicated_dataset
60
60
 
61
61
  def get_instances(self, output_path: str) -> List[Instance]:
62
- hlog("Loading the HuggingFace dataset. The first time could take several minutes.")
63
- subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
64
- hf_dataset: Any = load_dataset("wmt14", subset_name)
65
- splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
62
+ with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
63
+ subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
64
+ hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
65
+ splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
66
66
 
67
67
  instances: List[Instance] = []
68
- hlog("Generating instances")
69
- # Some training sets are too large, so we will only take a random subset of it.
70
- hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
71
- hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
72
- for example in hf_dataset["train"]["translation"]:
73
- source_sentence: str = example[self.source_language]
74
- target_sentence: str = example[self.target_language]
75
- instances.append(
76
- Instance(
77
- input=Input(text=source_sentence),
78
- references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
79
- split="train",
68
+ with htrack_block("Generating instances"):
69
+ # Some training sets are too large, so we will only take a random subset of it.
70
+ hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
71
+ hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
72
+ for example in hf_dataset["train"]["translation"]:
73
+ source_sentence: str = example[self.source_language]
74
+ target_sentence: str = example[self.target_language]
75
+ instances.append(
76
+ Instance(
77
+ input=Input(text=source_sentence),
78
+ references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
79
+ split="train",
80
+ )
80
81
  )
81
- )
82
82
 
83
83
  # No special handling needed for validation or test.
84
84
  for split_name in ["validation", "test"]:
helm/benchmark/server.py CHANGED
@@ -5,10 +5,15 @@ Starts a local HTTP server to display benchmarking assets.
5
5
 
6
6
  import argparse
7
7
  import importlib_resources as resources
8
+ import json
8
9
  from os import path
9
10
  import urllib
10
11
 
11
- from bottle import Bottle, static_file
12
+ from bottle import Bottle, static_file, HTTPResponse
13
+ import yaml
14
+
15
+ from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
16
+ from helm.common.general import serialize_dates
12
17
 
13
18
 
14
19
  app = Bottle()
@@ -28,6 +33,35 @@ def serve_config():
28
33
  )
29
34
 
30
35
 
36
+ # Shim for running helm-server for old suites from old version of helm-summarize
37
+ # that do not contain schema.json.
38
+ #
39
+ # The HELM web frontend expects to find a schema.json at /benchmark_output/runs/<version>/schema.json
40
+ # which is produced by the new version of helm-summarize but not the old version.
41
+ # When serving a suite produced by the old version of helm-summarize, the schema.json will be missing.
42
+ # This shim supports those suites by serving a schema.json that is dynamically computed from schema_classic.yaml
43
+ #
44
+ # We will remove this in a few months after most users have moved to the new version of helm-summarize.
45
+ #
46
+ # TODO(2024-03-01): Remove this.
47
+ @app.get("/benchmark_output/<runs_or_releases:re:runs|releases>/<version>/schema.json")
48
+ def server_schema(runs_or_releases, version):
49
+ relative_schema_path = path.join(runs_or_releases, version, "schema.json")
50
+ absolute_schema_path = path.join(app.config["helm.outputpath"], relative_schema_path)
51
+ if path.isfile(absolute_schema_path):
52
+ response = static_file(relative_schema_path, root=app.config["helm.outputpath"])
53
+ else:
54
+ # Suite does not contain schema.json
55
+ # Fall back to schema_classic.yaml from the static directory
56
+ classic_schema_path = path.join(app.config["helm.staticpath"], SCHEMA_CLASSIC_YAML_FILENAME)
57
+ with open(classic_schema_path, "r") as f:
58
+ response = HTTPResponse(json.dumps(yaml.safe_load(f), indent=2, default=serialize_dates))
59
+ response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
60
+ response.set_header("Expires", "0")
61
+ response.content_type = "application/json"
62
+ return response
63
+
64
+
31
65
  @app.get("/benchmark_output/<filename:path>")
32
66
  def serve_benchmark_output(filename):
33
67
  response = static_file(filename, root=app.config["helm.outputpath"])
@@ -36,6 +70,14 @@ def serve_benchmark_output(filename):
36
70
  return response
37
71
 
38
72
 
73
+ @app.get("/cache/output/<filename:path>")
74
+ def serve_cache_output(filename):
75
+ response = static_file(filename, root=app.config["helm.cacheoutputpath"])
76
+ response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
77
+ response.set_header("Expires", "0")
78
+ return response
79
+
80
+
39
81
  @app.get("/")
40
82
  @app.get("/<filename:path>")
41
83
  def serve_static(filename="index.html"):
@@ -53,6 +95,12 @@ def main():
53
95
  help="The location of the output path (filesystem path or URL)",
54
96
  default="benchmark_output",
55
97
  )
98
+ parser.add_argument(
99
+ "--cache-output-path",
100
+ type=str,
101
+ help="The location of the filesystem cache output folder (filesystem path or URL)",
102
+ default="prod_env/cache/output",
103
+ )
56
104
  parser.add_argument(
57
105
  "--suite",
58
106
  type=str,
@@ -65,6 +113,11 @@ def main():
65
113
  default=None,
66
114
  help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
67
115
  )
116
+ parser.add_argument(
117
+ "--jquery",
118
+ action="store_true",
119
+ help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
120
+ )
68
121
  args = parser.parse_args()
69
122
 
70
123
  if args.suite and args.release:
@@ -73,7 +126,8 @@ def main():
73
126
  # Determine the location of the static directory.
74
127
  # This is a hack: it assumes that the static directory has a physical location,
75
128
  # which is not always the case (e.g. when using zipimport).
76
- resource_path = resources.files("helm.benchmark.static").joinpath("index.html")
129
+ static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
130
+ resource_path = resources.files(static_package_name).joinpath("index.html")
77
131
  with resources.as_file(resource_path) as resource_filename:
78
132
  static_path = str(resource_filename.parent)
79
133
 
@@ -83,16 +137,19 @@ def main():
83
137
  # Output path is a URL, so set the output path base URL in the frontend to that URL
84
138
  # so that the frontend reads from that URL directly.
85
139
  app.config["helm.outputpath"] = None
140
+ # TODO: figure out helm.cacheoutputpath
86
141
  app.config["helm.outputurl"] = args.output_path
87
142
  else:
88
143
  # Output path is a location on disk, so set the output path base URL to /benchmark_output
89
144
  # and then serve files from the location on disk at that URL.
90
145
  app.config["helm.outputpath"] = path.abspath(args.output_path)
146
+ app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
91
147
  app.config["helm.outputurl"] = "benchmark_output"
92
148
 
93
149
  app.config["helm.suite"] = args.suite or "latest"
94
150
  app.config["helm.release"] = args.release
95
151
 
152
+ print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
96
153
  app.run(host="0.0.0.0", port=args.port)
97
154
 
98
155
 
@@ -2,6 +2,8 @@ import re
2
2
  import subprocess
3
3
  from typing import Mapping, Set, Union
4
4
 
5
+ from retrying import retry
6
+
5
7
  from helm.common.optional_dependencies import handle_module_not_found_error
6
8
 
7
9
  try:
@@ -67,6 +69,11 @@ def submit_slurm_job(command: str, slurm_args: Mapping[str, Union[str, int]]) ->
67
69
  return slurm.sbatch(command)
68
70
 
69
71
 
72
+ @retry(
73
+ wait_incrementing_start=5 * 1000, # 5 seconds
74
+ wait_incrementing_increment=5 * 1000, # 5 seconds
75
+ stop_max_attempt_number=5,
76
+ )
70
77
  def get_slurm_job_state(job_id: int) -> str:
71
78
  """Get the state of a Slurm job."""
72
79
  try:
@@ -80,6 +87,11 @@ def get_slurm_job_state(job_id: int) -> str:
80
87
  return search_result.group(1)
81
88
 
82
89
 
90
+ @retry(
91
+ wait_incrementing_start=5 * 1000, # 5 seconds
92
+ wait_incrementing_increment=5 * 1000, # 5 seconds
93
+ stop_max_attempt_number=5,
94
+ )
83
95
  def cancel_slurm_job(job_id: int) -> None:
84
96
  """Cancel a Slurm job."""
85
97
  try: