crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/scenarios/vision_language/bingo_scenario.py
@@ -0,0 +1,103 @@
+ import os.path
+ from typing import List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+ class BingoScenario(Scenario):
+     """
+     Holistic Analysis of Hallucination in GPT-4V(ision): Bias and Interference Challenges
+
+     We introduce a new benchmark, namely, the Bias and Interference Challenges in Visual Language Models (Bingo).
+     This benchmark is designed to evaluate and shed light on the two common types of hallucinations in visual
+     language models: bias and interference. Here, bias refers to the model's tendency to hallucinate certain types
+     of responses, possibly due to imbalance in its training data. Interference pertains to scenarios where the
+     judgment of GPT-4V(ision) can be disrupted due to how the text prompt is phrased or how the input image is
+     presented. The benchmark consists of open-ended question-answer pairs, and we employ open-ended generation
+     metrics for evaluation. In the experiment, we identify a notable regional bias, whereby GPT-4V(ision) is
+     better at interpreting Western images or images with English writing compared to images from other countries
+     or containing text in other languages.
+
+
+     @article{cui2023holistic,
+         title={Holistic analysis of hallucination in gpt-4v (ision): Bias and interference challenges},
+         author={Cui, Chenhang and Zhou, Yiyang and Yang, Xinyu and Wu, Shirley and Zhang, Linjun and
+                 Zou, James and Yao, Huaxiu},
+         journal={arXiv preprint arXiv:2311.03287},
+         year={2023}
+     }
+
+     Paper: https://arxiv.org/abs/2311.03287
+     """
+
+     BINGO_HUGGINGFACE_DATASET_NAME: str = "PahaII/Bingo"
+
+     IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/Bingo/resolve/main/images/{image_path}?download=true"
+
+     SUBJECTS: List[str] = ["T2I", "I2I", "OCR", "Factual", "Region"]
+
+     name = "bingo"
+     description = (
+         "Evaluate multimodal models on biased and interference-challenging scenarios with five subjects"
+         " ([paper](https://arxiv.org/abs/2311.03287))."
+     )
+     tags = ["vision-language"]
+
+     def __init__(self, subject: str):
+         super().__init__()
+         assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+         self._subject: str = subject
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         images_path: str = os.path.join(output_path, "images")
+         ensure_directory_exists(images_path)
+
+         # There is only the test split in the Bingo benchmark
+         instances: List[Instance] = []
+         question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+
+         # Process the test set
+         for row in tqdm(
+             load_dataset(
+                 self.BINGO_HUGGINGFACE_DATASET_NAME,
+                 data_files=question_data_files,
+                 split=TEST_SPLIT,
+                 cache_dir=output_path,
+             )
+         ):
+             # Download the image
+             image_path: str = row["image_path"]
+             local_image_path: str = os.path.join(output_path, image_path)
+             ensure_file_downloaded(
+                 source_url=self.IMAGE_URL.format(image_path=image_path),
+                 target_path=local_image_path,
+                 unpack=False,
+             )
+
+             content: List[MediaObject] = [
+                 MediaObject(location=local_image_path, content_type="image/png"),
+                 MediaObject(text=row["question"], content_type="text/plain"),
+             ]
+             answer: str = row["answer"]
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
@@ -0,0 +1,92 @@
+ import os.path
+ from typing import List
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     ALL_SPLITS,
+     CORRECT_TAG,
+     VALID_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+ class HatefulMemesScenario(Scenario):
+     """
+     The Hateful Memes Challenge and Dataset is a competition and open source dataset designed to
+     measure progress in multimodal vision-and-language classification.
+
+     In order for AI to become a more effective tool for detecting hate speech, it must be able to understand
+     content the way people do: holistically. When viewing a meme, for example, we don’t think about the words
+     and photo independently of each other; we understand the combined meaning. This is extremely challenging
+     for machines, however, because it means they can’t analyze the text and the image separately. They must
+     combine these different modalities and understand how the meaning changes when they are presented together.
+
+     @article{kiela2020hateful,
+         title={The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes},
+         author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh,
+                 Amanpreet and Ringshia, Pratik and Testuggine, Davide},
+         journal={arXiv preprint arXiv:2005.04790},
+         year={2020}
+     }
+
+     Paper: https://arxiv.org/abs/2005.04790
+     Website: https://ai.meta.com/tools/hatefulmemes/
+     """
+
+     IMAGE_URL: str = (
+         "https://huggingface.co/datasets/neuralcatcher/hateful_memes/resolve/main/{image_path}?download=true"
+     )
+
+     # This question is from the example on the website
+     QUESTION: str = "Is this meme mean?"
+
+     name = "hateful_memes"
+     description = "Identify if a meme is hateful or not ([paper](https://arxiv.org/abs/2005.04790))."
+     tags = ["vision-language"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         ensure_directory_exists(os.path.join(output_path, "img"))
+
+         instances: List[Instance] = []
+         for split in ALL_SPLITS:
+             for row in tqdm(
+                 load_dataset(
+                     "neuralcatcher/hateful_memes",
+                     split="validation" if split == VALID_SPLIT else split,
+                     cache_dir=output_path,
+                 )
+             ):
+                 # Download the meme
+                 image_path: str = row["img"]
+                 local_image_path: str = os.path.join(output_path, image_path)
+                 ensure_file_downloaded(
+                     source_url=self.IMAGE_URL.format(image_path=image_path),
+                     target_path=local_image_path,
+                     unpack=False,
+                 )
+                 # Some examples are missing images. Skip those for now
+                 if not os.path.exists(local_image_path) or os.path.getsize(local_image_path) == 0:
+                     continue
+
+                 content: List[MediaObject] = [
+                     MediaObject(location=local_image_path, content_type="image/jpeg"),
+                     MediaObject(text=self.QUESTION, content_type="text/plain"),
+                 ]
+                 answer: str = "Yes" if row["label"] == 1 else "No"
+                 instances.append(
+                     Instance(
+                         Input(multimedia_content=MultimediaObject(content)),
+                         references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                         split=split,
+                     )
+                 )
+
+         return instances
helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py
@@ -0,0 +1,113 @@
+ from collections import Counter
+ from dataclasses import dataclass
+ from typing import Dict, List, Set
+ import json
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     ALL_SPLITS,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_file_downloaded
+
+
+ @dataclass(frozen=True)
+ class HEIMHumanEvalReference(Reference):
+     # The number of human annotators who gave this reference or answer.
+     num_human_answered: int = 0
+
+
+ class HEIMHumanEvalScenario(Scenario):
+     """
+     In [Holistic Evaluation of Text-To-Image Models (HEIM)](https://arxiv.org/abs/2311.04287), we evaluated 26
+     state-of-the-art text-to-image models across 12 different aspects (e.g., toxicity mitigation, unbiasedness,
+     originality, etc.). We used human annotators through AWS Mechanical Turk to evaluate the models for some of
+     these aspects (see image below).
+     This scenario contains the AI-generated images and human annotations for the following question types:
+
+     1. Alignment
+     2. Aesthetics
+     3. Clear subject
+     4. Originality
+     5. Photorealism
+
+     Citations:
+     - HEIM: https://arxiv.org/abs/2311.04287
+     - MS COCO: https://arxiv.org/abs/1405.0312
+     """
+
+     DATASET_DOWNLOAD_URL: str = (
+         "https://worksheets.codalab.org/rest/bundles/0x502d646c366c4f1d8c4a2ccf163b958f/contents/blob/"
+     )
+     VALID_QUESTION_TYPES: Set[str] = {"alignment", "aesthetics", "clear_subject", "originality", "photorealism"}
+
+     name = "heim_human_eval"
+     description = (
+         "Images generated by text-to-image models and human annotations for HEIM "
+         "([paper](https://arxiv.org/abs/2311.04287))."
+     )
+     tags = ["vision-language", "visual question answering", "image evaluation"]
+
+     def __init__(self, question_type: str):
+         super().__init__()
+         assert question_type in self.VALID_QUESTION_TYPES, f"Invalid question type: {question_type}"
+         self._question_type: str = question_type
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Download the dataset
+         output_path = os.path.join(output_path, "dataset")
+         ensure_file_downloaded(
+             source_url=self.DATASET_DOWNLOAD_URL, target_path=output_path, unpack=True, unpack_type="untar"
+         )
+
+         # Load the multiple-choice questions
+         with open(os.path.join(output_path, "questions.json")) as questions_file:
+             question_info: Dict = json.load(questions_file)[self._question_type]
+
+         instances: List[Instance] = []
+         for split in ALL_SPLITS:
+             annotations_split_path: str = os.path.join(output_path, f"{self._question_type}_{split}.jsonl")
+             with open(annotations_split_path) as f:
+                 # Read each line/example as a JSON object
+                 for line in f.readlines():
+                     image_annotation: Dict = json.loads(line)
+                     image_path: str = os.path.join(output_path, image_annotation["image_path"])
+                     assert os.path.exists(image_path), f"Image {image_path} does not exist"
+
+                     # Get the most common human answer(s) for the question
+                     human_answers: List[str] = [str(answer) for answer in image_annotation["human_annotations"]]
+                     human_answers_to_counts = Counter(human_answers)
+                     max_count: int = max(human_answers_to_counts.values())
+                     modes: List[str] = [value for value, count in human_answers_to_counts.items() if count == max_count]
+
+                     content: List[MediaObject] = [MediaObject(location=image_path, content_type="image/png")]
+                     if "prompt" in image_annotation:
+                         # Include the prompt in the content if it exists
+                         prompt: str = image_annotation["prompt"]
+                         content.append(MediaObject(text=f"Description: {prompt}", content_type="text/plain"))
+                     content.append(MediaObject(text=question_info["question"], content_type="text/plain"))
+
+                     references: List[Reference] = [
+                         HEIMHumanEvalReference(
+                             Output(text=answer),
+                             # The mode is the most common human answer and the reference we mark as correct
+                             tags=[CORRECT_TAG] if value in modes else [],
+                             num_human_answered=human_answers_to_counts[value],
+                         )
+                         for value, answer in question_info["choices"].items()
+                     ]
+                     instances.append(
+                         Instance(
+                             Input(multimedia_content=MultimediaObject(content)),
+                             references=references,
+                             split=split,
+                         )
+                     )
+
+         return instances
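A note on the mode computation above: the Counter-based selection marks every answer that ties for the highest annotator count as correct. A minimal sketch of that logic in isolation (the annotation values are invented for illustration):

    from collections import Counter

    # Hypothetical annotator answers for one image; "2" and "3" tie.
    human_answers = [str(a) for a in [1, 2, 2, 3, 3]]
    counts = Counter(human_answers)       # Counter({'2': 2, '3': 2, '1': 1})
    max_count = max(counts.values())      # 2
    modes = [v for v, c in counts.items() if c == max_count]
    print(modes)                          # ['2', '3']

So a tie produces multiple references tagged CORRECT_TAG, each still carrying its own num_human_answered count.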
helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py
@@ -0,0 +1,55 @@
+ from typing import List
+ import os
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+
+
+ class Chart2CSVScenario(Scenario):
+     """
+     Chart to CSV
+     """
+
+     name = "chart2csv"
+     description = "Convert a chart to CSV."
+     tags = ["vision-language", "image2structure"]
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"
+         instances: List[Instance] = []
+
+         charts_path: str = os.path.join(output_path, "charts")
+         ground_truths_path: str = os.path.join(output_path, "groundtruth")
+
+         for chart_file in os.listdir(charts_path):
+             if not chart_file.endswith(".png"):
+                 continue
+
+             chart_path: str = os.path.join(charts_path, chart_file)
+             ground_truth_file: str = chart_file.replace(".png", ".psv")
+             ground_truth_path: str = os.path.join(ground_truths_path, ground_truth_file)
+             assert os.path.exists(ground_truth_path), f"Ground truth does not exist at {ground_truth_path}"
+
+             content: List[MediaObject] = [
+                 MediaObject(location=chart_path, content_type="image/png"),
+             ]
+             with open(ground_truth_path, "r") as file:
+                 ground_truth: str = file.read().replace("|", ",")
+
+             instances.append(
+                 Instance(
+                     Input(multimedia_content=MultimediaObject(content)),
+                     references=[Reference(Output(text=ground_truth), tags=[CORRECT_TAG])],
+                     split=TEST_SPLIT,
+                 )
+             )
+
+         return instances
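The ground-truth files are pipe-separated (.psv) and are converted to CSV with a plain string replacement. A toy illustration of that conversion (the data is invented for the example):

    psv = "year|sales\n2021|10\n2022|12"
    csv = psv.replace("|", ",")
    print(csv)
    # year,sales
    # 2021,10
    # 2022,12

Note that the replacement is unconditional, so any literal pipe characters inside cell values would also be converted.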
helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py
@@ -0,0 +1,214 @@
+ import os.path
+ from typing import List, Optional, Dict, Any
+ from abc import abstractmethod
+
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ from helm.benchmark.scenarios.scenario import (
+     CORRECT_TAG,
+     ASSET_NAME_TAG,
+     ASSET_PATH_TAG,
+     TEST_SPLIT,
+     VALID_SPLIT,
+     Instance,
+     Input,
+     Output,
+     Reference,
+     Scenario,
+ )
+ from helm.common.media_object import MediaObject, MultimediaObject
+ from helm.common.general import ensure_directory_exists
+ from helm.common.hierarchical_logger import hlog
+
+ PROCESSED: str = "processed"
+
+
+ class Image2StructureScenario(Scenario):
+     BASE_PROMPT: str
+     HUGGINGFACE_DATASET_NAME: str
+     SUBSETS: List[str]
+
+     name: str
+     description: str
+     tags = ["vision-language"]
+
+     helm_split_to_huggingface_split = {
+         TEST_SPLIT: "test",
+         VALID_SPLIT: "validation",
+     }
+
+     def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+         super().__init__()
+         assert subset in self.SUBSETS, f"Invalid subset: {subset}"
+         self._subset: str = subset
+         self._recompile_prompt: bool = recompile_prompt
+         self._split: str = split
+         self._output_path: Optional[str] = None
+
+     def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
+         # By default, there are no assets
+         del row["assets"]
+         row["assets_paths"] = []
+         row["assets_names"] = []
+         return row
+
+     def build_prompt(self, row: Dict[str, Any]) -> str:
+         return self.BASE_PROMPT
+
+     @abstractmethod
+     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
+         """Compile the structure; save the resulting image and return the text extracted from the image."""
+         pass
+
+     def finalize(self, row: Dict[str, Any]) -> None:
+         """Perform cleanup operations after the instance has been generated."""
+         pass
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         """Get the instances for the scenario. The compile_and_save method should be implemented by the subclass.
+         Additionally, the subclass should implement the preprocess_row method if any preprocessing is needed.
+
+         For each instance, the following steps are performed:
+         1. Preprocess the row
+         2. Save the image locally
+             - 2.a. If we don't want to recompile the prompt, save the image directly
+             - 2.b. If we want to recompile the prompt, compile the structure and save the image
+         3. Create the prompt
+         4. Create the multimedia content
+         5. Create the references
+         6. Finalize the Instance
+
+         Args:
+             output_path (str): The path where the instances will be saved
+
+         Returns:
+             List[Instance]: The list of instances
+         """
+         self._output_path = output_path
+         images_path: str = os.path.join(output_path, "data/images", self._subset)
+         assets_path: str = os.path.join(output_path, "data/assets", self._subset)
+         ensure_directory_exists(images_path)
+         ensure_directory_exists(assets_path)
+
+         instances: List[Instance] = []
+
+         # Process the desired set of instances
+         for row in tqdm(
+             load_dataset(
+                 self.HUGGINGFACE_DATASET_NAME,
+                 self._subset,
+                 split=self.helm_split_to_huggingface_split[self._split],
+                 cache_dir=output_path,
+             )
+         ):
+             question_uuid: str = str(row["uuid"]).replace('"', "")
+             if row["category"][1:-1] != self._subset:
+                 hlog(
+                     f"Skipping instance {question_uuid} as it belongs to category"
+                     f" {row['category']} and not {self._subset}"
+                 )
+                 continue
+
+             # Step 1: Preprocess the row
+             row = self.preprocess_row(row, assets_path)
+
+             # Step 2: Save the image locally
+             image_path: str = os.path.join(images_path, f"{question_uuid}.png")
+             if not os.path.exists(image_path):
+                 if not self._recompile_prompt:  # 2.a
+                     row["image"].save(image_path)
+                 else:  # 2.b
+                     if "structure" not in row:
+                         raise ValueError("Cannot recompile prompt without structure")
+                     structure: str = row["structure"]
+                     text: str = self.compile_and_save(structure, assets_path, image_path)
+                     row["text"] = text
+
+             # Step 3: Create the prompt
+             prompt: str = self.build_prompt(row)
+
+             # Step 4: Create the multimedia content
+             image_object = MediaObject(location=image_path, content_type="image/png")
+             content: List[MediaObject] = [
+                 MediaObject(text=prompt, content_type="text/plain"),
+                 image_object,
+             ]
+
+             # Step 5: Create the references
+             # 5.a Create the reference containing the structure and the associated image.
+             reference: Reference
+             if "structure" in row:
+                 multimedia_object: MultimediaObject
+                 if os.path.exists(row["structure"]):
+                     # 5.a.1 The structure is a path, therefore represent it as a multimedia object
+                     # containing the files used to compile the structure (such as a repository
+                     # containing the HTML, CSS, and JavaScript files used to generate a webpage)
+                     multimedia_object = MultimediaObject(
+                         [image_object, MediaObject(location=row["structure"], content_type="path/path")]
+                     )
+                 elif row["structure"] == PROCESSED:
+                     # 5.a.2 The structure has been processed and is no longer present in the row.
+                     # This can be the case if the structure is a base64 encoding of an archive that
+                     # has been extracted to a temporary path and processed, but the path no longer
+                     # exists (deleted after the processing is done)
+                     multimedia_object = MultimediaObject([image_object])
+                 else:
+                     # 5.a.3 The structure is not a path, therefore it is directly a valid string
+                     # representing the structure (such as LaTeX code)
+                     multimedia_object = MultimediaObject([image_object])
+                 reference = Reference(
+                     output=Output(text=row["text"], multimedia_content=multimedia_object),
+                     tags=[CORRECT_TAG],
+                 )
+             else:
+                 if "text" in row:
+                     reference = Reference(
+                         output=Output(text=row["text"], multimedia_content=MultimediaObject([image_object])),
+                         tags=[CORRECT_TAG],
+                     )
+                 else:
+                     reference = Reference(
+                         output=Output(multimedia_content=MultimediaObject([image_object])), tags=[CORRECT_TAG]
+                     )
+             references: List[Reference] = [reference]
+
+             # 5.b Create the reference containing the assets
+             if len(row["assets_paths"]) > 0:
+                 assets_paths_reference: Reference = Reference(
+                     output=Output(
+                         text=", ".join(
+                             row["assets_paths"]
+                         ),  # TODO: This is for debugging purposes (to show in the frontend)
+                         multimedia_content=MultimediaObject(
+                             [
+                                 MediaObject(location=asset, content_type=f"image/{asset.split('.')[-1].lower()}")
+                                 for asset in row["assets_paths"]
+                             ]
+                         ),
+                     ),
+                     tags=[ASSET_PATH_TAG],
+                 )
+                 references.append(assets_paths_reference)
+                 assets_names_reference: Reference = Reference(
+                     output=Output(
+                         text=", ".join(
+                             row["assets_names"]
+                         ),  # TODO: This is for debugging purposes (to show in the frontend)
+                         multimedia_content=MultimediaObject(
+                             [MediaObject(text=asset, content_type="text/plain") for asset in row["assets_names"]]
+                         ),
+                     ),
+                     tags=[ASSET_NAME_TAG],
+                 )
+                 references.append(assets_names_reference)
+
+             # Step 6: Finalize the Instance
+             self.finalize(row)
+             instance = Instance(
+                 input=Input(multimedia_content=MultimediaObject(content)), references=references, split=self._split
+             )
+             instances.append(instance)
+
+         assert len(instances) > 0, f"No instances found for subset {self._subset}"
+         return instances
helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py
@@ -0,0 +1,25 @@
+ from helm.benchmark.scenarios.scenario import VALID_SPLIT
+ from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
+     latex_to_image,
+     strip_unnecessary_latex_parts,
+ )
+ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+
+
+ class LatexScenario(Image2StructureScenario):
+     BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
+     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
+     SUBSETS = ["equation", "table", "plot", "algorithm"]
+
+     name = "image2latex"
+     description = "Evaluate multimodal models on LaTeX generation to recreate a provided image"
+
+     def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+         super().__init__(subset, recompile_prompt, split)
+
+     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
+         image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
+         image.save(destination_path)
+         assert "latex_code" in infos
+         text: str = strip_unnecessary_latex_parts(infos["latex_code"])
+         return text
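For context, a hypothetical usage sketch of this subclass (the call pattern follows the Image2StructureScenario base class earlier in this diff; the output path is made up):

    from helm.benchmark.scenarios.vision_language.image2structure.latex_scenario import LatexScenario

    # recompile_prompt=False saves the dataset image directly (step 2.a in the
    # base class) instead of recompiling the LaTeX structure (step 2.b).
    scenario = LatexScenario(subset="equation", recompile_prompt=False)
    instances = scenario.get_instances(output_path="/tmp/image2latex")
    print(f"{len(instances)} instances in split {instances[0].split}")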
helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py
@@ -0,0 +1,20 @@
+ from helm.benchmark.scenarios.scenario import VALID_SPLIT
+ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+
+
+ class MusicSheetScenario(Image2StructureScenario):
+     BASE_PROMPT = (
+         "Please generate the Lilypond code to generate a music sheet that looks like this image as much as possible.\n"  # noqa: E501
+         "This music sheet was created by me, and I would like to recreate it using Lilypond."
+     )
+     HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
+     SUBSETS = ["music"]
+
+     name = "image2musicsheet"
+     description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
+
+     def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+         super().__init__(subset, recompile_prompt, split)
+
+     def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
+         raise Exception("Music sheets have no ground truth, compilation is not possible")