crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
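The dominant structural change in this list is a package reorganization: client modules move from helm/proxy/clients to a top-level helm/clients package, and tokenizer modules move from helm/proxy/tokenizers to helm/tokenizers. Downstream code that imports these modules would need to follow the renames. A minimal sketch of the before/after import paths, using AutoClient and AutoTokenizer because both files appear in the rename list above (the surrounding usage is illustrative only, not part of this diff):

    # crfm-helm 0.4.0 layout (old):
    # from helm.proxy.clients.auto_client import AutoClient
    # from helm.proxy.tokenizers.auto_tokenizer import AutoTokenizer

    # crfm-helm 0.5.0 layout, following the renames listed above:
    from helm.clients.auto_client import AutoClient
    from helm.tokenizers.auto_tokenizer import AutoTokenizer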
helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py
@@ -0,0 +1,225 @@
+from typing import Dict, List, Any
+
+from helm.benchmark.scenarios.scenario import VALID_SPLIT
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+    Image2StructureScenario,
+    PROCESSED,
+)
+from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+    save_random_screenshot,
+    ScreenshotOptions,
+)
+from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.common.general import ensure_directory_exists
+from helm.common.optional_dependencies import handle_module_not_found_error
+
+try:
+    from html2text import HTML2Text
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["image2structure"])
+
+
+import base64
+import os
+import threading
+import tarfile
+import shutil
+import time
+import pickle
+
+
+def extract_repo(base64_encoding: str, repo_path: str) -> None:
+    # Decode the base64 string, which corresponds to an archive,
+    # and extract the files to the repo_path
+    try:
+        archive = base64.b64decode(base64_encoding)
+        # Write to .tar file
+        with open(os.path.join(repo_path, "repo.tar.gz"), "wb") as f:
+            f.write(archive)
+        # Extract
+        with tarfile.open(os.path.join(repo_path, "repo.tar.gz"), "r:gz") as tar:
+            tar.extractall(repo_path)
+    except Exception as e:
+        raise ValueError(f"Failed to decode and extract the base64 archive: {e}")
+
+
+def list_assets(repo_path: str, extensions: List[str]) -> List[str]:
+    asset_paths: List[str] = []
+    for root, _, files in os.walk(repo_path):
+        for file in files:
+            if file.split(".")[-1].lower() in extensions:
+                # Remove repo_path from the file path;
+                # ignore paths that start with a dot or are in _site
+                if not root.startswith(os.path.join(repo_path, "_site")) and not root.startswith(
+                    os.path.join(repo_path, ".")
+                ):
+                    asset_paths.append(os.path.relpath(os.path.join(root, file), repo_path))
+    return asset_paths
+
+
+def serve_and_take_screenshot(
+    repo_path: str,
+    destination_path: str,
+    screenshot_options: ScreenshotOptions = ScreenshotOptions(),
+    max_tries: int = 5,
+) -> Dict[str, Any]:
+    # Start the Jekyll server.
+    # Select a unique port per thread
+    port: int = 4000 + int(threading.get_ident()) % 1000
+    server = JekyllServer(repo_path, port=port, verbose=False)
+    success: bool = server.start()
+    if not success:
+        # This runs on examples that are not expected to fail
+        server.stop()
+        raise ValueError(f"Jekyll server failed to start: {repo_path}")
+
+    # Take a screenshot of a random page
+    success = False
+    error: Exception
+    for _ in range(max_tries):
+        try:
+            infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
+            success = True
+            break
+        except Exception as e:
+            error = e
+            if "net::ERR_CONNECTION_REFUSED" in str(e):
+                # The server may not be up yet; restart it and retry
+                server.stop()
+                time.sleep(0.5)
+                server.start()
+                time.sleep(0.5)
+            else:
+                # Do not retry
+                break
+    if not success:
+        raise ValueError(f"Failed to take a screenshot: {error}")
+
+    # Stop the server
+    server.stop()
+    time.sleep(0.1)
+
+    return infos
+
+
+class WebpageScenario(Image2StructureScenario):
+    BASE_PROMPT = (
+        "Please generate the source code to generate a webpage that looks like this image as much as feasibly possible.\n"  # noqa: E501
+        "You should output a json object associating each file name with its content.\n\n"
+        "Here is a simple example of the expected structure (that does not correspond to the image)."
+        " In this example, 3 files are created: index.html, style.css and script.js.\n"
+        "[\n"
+        "  {\n"
+        '    "filename": "index.html",\n'
+        '    "content": "<!DOCTYPE html>\\n<html>\\n<head>\\n<title>Title of the document</title>\\n</head>\\n<body>\\n\\n<p>Content of the document......</p>\\n\\n</body>\\n</html>"\n'  # noqa: E501
+        "  },\n"
+        "  {\n"
+        '    "filename": "style.css",\n'
+        '    "content": "body {\\n  background-color: lightblue;\\n}\\nh1 {\\n  color: white;\\n  text-align: center;\\n}"\n'  # noqa: E501
+        "  },\n"
+        "  {\n"
+        '    "filename": "script.js",\n'
+        '    "content": "document.getElementById(\\"demo\\").innerHTML = \\"Hello JavaScript!\\";"\n'
+        "  }\n"
+        "]\n"
+        "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that it should"
+        " look like the image as much as feasibly possible."
+    )
+
+    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
+    SUBSETS = ["css", "html", "javascript"]
+    MAX_TRIES: int = 5
+    ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
+
+    name = "image2webpage"
+    description = "Evaluate multimodal models on webpage generation to recreate a provided image"
+
+    def __init__(
+        self,
+        subset: str,
+        recompile_prompt: bool = True,
+        split: str = VALID_SPLIT,
+        screenshot_options: ScreenshotOptions = ScreenshotOptions(),
+    ):
+        super().__init__(subset, recompile_prompt, split)
+        self._screenshot_options = screenshot_options
+        self._html2text = HTML2Text()
+        self._html2text.ignore_links = True
+
+    def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
+        """Extract the base64 encoding of the repo from the row and return it."""
+        # No need to reprocess if the assets are already saved
+        assets_save_path: str = os.path.join(assets_path, str(row["uuid"].replace('"', "")))
+        if os.path.exists(assets_save_path):
+            try:
+                with open(os.path.join(assets_save_path, "assets_paths.pkl"), "rb") as f:
+                    row["assets_paths"] = pickle.load(f)
+                with open(os.path.join(assets_save_path, "assets_names.pkl"), "rb") as f:
+                    row["assets_names"] = pickle.load(f)
+                del row["assets"]
+                row["structure"] = PROCESSED
+                return row
+            except Exception:
+                # There was an issue when loading the assets; reprocess
+                shutil.rmtree(assets_save_path)
+        ensure_directory_exists(assets_save_path)
+
+        # Structure is a base64 encoding of the repo
+        if self._output_path is None:
+            raise ValueError("Output path not set")
+        repo_path = os.path.join(self._output_path, f"tmp{threading.get_ident()}_{self._subset}")
+        ensure_directory_exists(repo_path)
+
+        # Decode the base64 string, which corresponds to an archive,
+        # and extract the files to the repo_path
+        structure: str = row["structure"]
+        extract_repo(structure, repo_path)
+        row["structure"] = PROCESSED
+        row["repo_path"] = repo_path  # Stored for cleanup
+
+        # Process the assets
+        asset_paths: List[str] = list_assets(repo_path, self.ASSETS_EXTENSIONS)
+        del row["assets"]
+        row["assets_paths"] = []
+        row["assets_names"] = []
+        # Copy each asset to a unique persistent path
+        for i, asset_local_path in enumerate(asset_paths):
+            asset_name: str = asset_local_path
+            asset_dest_path = os.path.join(assets_save_path, f"{i}.{asset_local_path.split('.')[-1]}")
+            shutil.copyfile(os.path.join(row["repo_path"], asset_local_path), asset_dest_path)
+            row["assets_paths"].append(asset_dest_path)
+            row["assets_names"].append(asset_name)
+
+        # Save both assets_paths and assets_names as files that can be loaded
+        with open(os.path.join(assets_save_path, "assets_paths.pkl"), "wb") as f:
+            pickle.dump(row["assets_paths"], f)
+        with open(os.path.join(assets_save_path, "assets_names.pkl"), "wb") as f:
+            pickle.dump(row["assets_names"], f)
+
+        return row
+
+    def build_prompt(self, row: Dict[str, Any]) -> str:
+        prompt: str = self.BASE_PROMPT
+        assert "assets_paths" in row, "No assets paths in the row"
+        assert "assets_names" in row, "No assets names in the row"
+        assert len(row["assets_paths"]) == len(row["assets_names"])
+        if len(row["assets_names"]) > 0:
+            prompt += "\nYou have access to the following assets:\n"
+            for asset_local_path in row["assets_names"]:
+                prompt += f"- {asset_local_path}\n"
+        return prompt
+
+    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
+        # Structure is the path to the repo.
+        # Serve and take a screenshot
+        repo_path: str = structure
+        infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, self._screenshot_options)
+        text: str = convert_html_to_text(self._html2text, infos["html"])
+        return text
+
+    def finalize(self, row: Dict[str, Any]) -> None:
+        """Perform cleanup operations after the instance has been generated."""
+        if "repo_path" in row:
+            repo_path: str = row["repo_path"]
+            shutil.rmtree(repo_path)
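End to end, webpage_scenario.py decodes a base64-encoded .tar.gz of a Jekyll site, serves it on a per-thread port, and screenshots a rendered page. A minimal standalone sketch composing the two module-level helpers defined above; the base64 payload and output path are placeholders, and the html2text/Jekyll/Selenium dependencies from the image2structure extra are assumed to be installed:

    import tempfile

    repo_b64 = "..."  # placeholder: base64-encoded .tar.gz of a Jekyll site, as stored in the dataset

    with tempfile.TemporaryDirectory() as repo_path:
        extract_repo(repo_b64, repo_path)  # decode the archive and unpack the site files
        # Start Jekyll, retry on connection errors, and screenshot a random page
        infos = serve_and_take_screenshot(repo_path, "screenshot.png")
        print(infos["html"])  # HTML of the captured page, as consumed by compile_and_save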
helm/benchmark/scenarios/vision_language/mementos_scenario.py
@@ -0,0 +1,124 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+
+
+class MementosScenario(Scenario):
+    """
+    Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+
+    We introduce Mementos, a new benchmark designed to assess MLLMs' sequential image reasoning abilities. Mementos
+    features 4,761 diverse image sequences with varying lengths.
+
+    @misc{wang2024mementos,
+        title={Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences},
+        author={Xiyao Wang and Yuhang Zhou and Xiaoyu Liu and Hongjin Lu and Yuancheng Xu and Feihong He and Jaehong Yoon
+                and Taixi Lu and Gedas Bertasius and Mohit Bansal and Huaxiu Yao and Furong Huang},
+        year={2024},
+        eprint={2401.10529},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2401.10529
+    """
+
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"
+
+    IMAGE_URL: str = (
+        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
+        "{subject}/{split}/{file_name}?download=true"
+    )
+
+    DATA_FILES: str = "{subject}/{split}/metadata.csv"
+
+    QUESTION_PROMPT: str = (
+        "Write a description for the given image sequence in a single paragraph, what is happening in this episode?"
+    )
+
+    SUBJECTS: List[str] = ["comics", "dailylife", "robotics"]
+
+    name = "mementos"
+    description = (
+        "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
+        " ([paper](https://arxiv.org/abs/2401.10529))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        subject_output_path: str = os.path.join(output_path, self._subject)
+        ensure_directory_exists(subject_output_path)
+
+        data_files = {
+            split: self.DATA_FILES.format(subject=self._subject, split=split) for split in [TRAIN_SPLIT, TEST_SPLIT]
+        }
+        instances: List[Instance] = []
+
+        for split in [TRAIN_SPLIT, TEST_SPLIT]:
+            cur_output_path = os.path.join(subject_output_path, split)
+            ensure_directory_exists(cur_output_path)
+
+            # Process the current split
+            for row in tqdm(
+                load_dataset(
+                    self.MEMENTOS_HUGGINGFACE_DATASET_NAME.format(subject=self._subject),
+                    data_files=data_files,
+                    split=split,
+                    cache_dir=cur_output_path,
+                )
+            ):
+                # Download the image
+                file_name: str = row["file_name"]
+                local_image_path: str = os.path.join(cur_output_path, file_name)
+                ensure_file_downloaded(
+                    source_url=self.IMAGE_URL.format(subject=self._subject, split=split, file_name=file_name),
+                    target_path=local_image_path,
+                    unpack=False,
+                )
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/png"),
+                    MediaObject(text=self.QUESTION_PROMPT, content_type="text/plain"),
+                ]
+                answer: str = row["description"]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+            print()
+
+        return instances
+
+
+def main():
+    scenario = MementosScenario("robotics")
+    instances = scenario.get_instances("output")
+    print(instances)
+
+
+if __name__ == "__main__":
+    main()
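The trailing main() is a smoke test; in an actual evaluation the scenario is referenced by a run spec instead. A rough sketch of that wiring through a ScenarioSpec (the real wiring lives in helm/benchmark/run_specs/vlm_run_specs.py, which this release adds but this excerpt does not show, so treat the argument shape as an assumption):

    from helm.benchmark.scenarios.scenario import ScenarioSpec

    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
        args={"subject": "robotics"},  # must be one of SUBJECTS: comics, dailylife, robotics
    )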
helm/benchmark/scenarios/vision_language/mme_scenario.py
@@ -0,0 +1,145 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MMEScenario(Scenario):
+    """
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    A Multimodal Large Language Model (MLLM) relies on a powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for such case studies to fully reflect the performance of an MLLM
+    without a comprehensive evaluation. MME fills in this blank, presenting the first
+    comprehensive MLLM evaluation benchmark. It measures both perception and cognition
+    abilities on a total of 14 subtasks. To avoid data leakage that may arise from
+    direct use of public datasets for evaluation, the annotations of instruction-answer
+    pairs are all manually designed. The concise instruction design allows MLLMs to be
+    compared fairly, instead of struggling with prompt engineering, and also makes it
+    easy to carry out quantitative statistics. We rephrase the answer type of MME as
+    multiple-choice question answering and use multiple-choice metrics for the 14
+    evaluation tasks.
+
+    @article{fu2023mme,
+        title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
+        author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
+                Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
+                Sun, Xing and Wu, Yunsheng and Ji, Rongrong},
+        journal={arXiv preprint arXiv:2306.13394},
+        year={2023}
+    }
+
+    Paper: https://arxiv.org/abs/2306.13394
+    """
+
+    MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
+
+    SUBJECTS: List[str] = [
+        "existence",
+        "scene",
+        "posters",
+        "color",
+        "OCR",
+        "position",
+        "celebrity",
+        "artwork",
+        "commonsense_reasoning",
+        "numerical_calculation",
+        "landmark",
+        "count",
+        "text_translation",
+        "code_reasoning",
+    ]
+
+    name = "mme"
+    description = "Evaluate multimodal models on 14 perception and cognition subtasks ([paper](https://arxiv.org/abs/2306.13394))."
+    tags = ["vision-language"]
+    options: List[str] = ["Yes", "No"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_label_from_answer(self, answer: str):
+        label: str
+        if answer == "Yes":
+            label = "A"
+        elif answer == "No":
+            label = "B"
+        else:
+            raise NotImplementedError(f"Invalid answer: {answer}")
+        return label
+
+    def remove_question_suffix_for_mcqa(self, question: str):
+        return question.replace("Please answer yes or no.", "").strip()
+
+    def get_question_id(self, question_id: str):
+        return question_id.split(".")[0].replace("/", "-")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        # There is only a test split in the MME benchmark
+        instances: List[Instance] = []
+        # Process the test set: one multiple-choice instance per row
+        for row in tqdm(
+            load_dataset(
+                self.MME_HUGGINGFACE_DATASET_NAME,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            if row["category"] != self._subject:
+                continue
+            question_id: str = self.get_question_id(row["question_id"])
+            # Save the image locally
+            image_path: str = os.path.join(images_path, f"{question_id}.png")
+            if not os.path.exists(image_path):
+                row["image"].save(image_path)
+
+            question: str = self.remove_question_suffix_for_mcqa(row["question"])
+            answer: str = row["answer"]
+            references: List[Reference] = []
+
+            answer = self.get_label_from_answer(answer)
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            # The "yes or no" options were originally appended to the question
+            for i, option in enumerate(self.options):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+
+            content = [
+                MediaObject(location=image_path, content_type="image/png"),
+                MediaObject(text=question, content_type="text/plain"),
+            ]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
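The multiple-choice conversion relies on a fixed option order: options = ["Yes", "No"], so the gold answer "Yes" maps to letter "A" and index 0, and "No" to "B" and index 1. A tiny worked check of that round trip, using only the methods defined above:

    scenario = MMEScenario(subject="existence")
    label = scenario.get_label_from_answer("No")  # -> "B"
    correct_answer_index = ord(label) - ord("A")  # -> 1
    assert scenario.options[correct_answer_index] == "No"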
helm/benchmark/scenarios/vision_language/mmmu_scenario.py
@@ -0,0 +1,187 @@
+import os.path
+from typing import Dict, List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MMMUScenario(Scenario):
+    """
+    MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI
+
+    We introduce MMMU: a new benchmark designed to evaluate multimodal models on massive multi-discipline
+    tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously
+    collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines:
+    Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.
+    These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such
+    as charts, diagrams, maps, tables, music sheets, and chemical structures.
+
+    @article{yue2023mmmu,
+        title={MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI},
+        author={Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel
+                Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and
+                Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun
+                and Yu Su and Wenhu Chen},
+        journal={arXiv preprint arXiv:2311.16502},
+        year={2023},
+    }
+
+    Paper: https://arxiv.org/abs/2311.16502
+    Website: https://mmmu-benchmark.github.io/
+    """
+
+    MMMU_HUGGINGFACE_DATASET_NAME: str = "MMMU/MMMU"
+    MAX_NUM_IMAGES: int = 7
+
+    SUBJECTS: List[str] = [
+        "Accounting",
+        "Agriculture",
+        "Architecture_and_Engineering",
+        "Art",
+        "Art_Theory",
+        "Basic_Medical_Science",
+        "Biology",
+        "Chemistry",
+        "Clinical_Medicine",
+        "Computer_Science",
+        "Design",
+        "Diagnostics_and_Laboratory_Medicine",
+        "Economics",
+        "Electronics",
+        "Energy_and_Power",
+        "Finance",
+        "Geography",
+        "History",
+        "Literature",
+        "Manage",
+        "Marketing",
+        "Materials",
+        "Math",
+        "Mechanical_Engineering",
+        "Music",
+        "Pharmacy",
+        "Physics",
+        "Psychology",
+        "Public_Health",
+        "Sociology",
+    ]
+
+    name = "mmmu"
+    description = (
+        "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
+        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str, question_type: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+        assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images", self._subject)
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+
+        # Process the validation set.
+        # There seems to be a dev set, but it's unavailable through load_dataset.
+        # When loading "dev", we get the error: ValueError: Unknown split "dev". Should be one of ['test', 'validation'].
+        # The test set doesn't have answers, since the MMMU competition/leaderboard uses the test set.
+        for row in tqdm(
+            load_dataset(self.MMMU_HUGGINGFACE_DATASET_NAME, self._subject, split="validation", cache_dir=output_path)
+        ):
+            # Skip questions that aren't of the question type we're interested in
+            if row["question_type"] != self._question_type:
+                continue
+
+            question_id: str = row["id"]
+            if self._subject not in question_id:
+                continue
+
+            question_template: str = row["question"]
+            options: List[str] = eval(row["options"])  # Convert the string to a list of options
+            answer: str = row["answer"]
+
+            # Create the question. Questions can have text and images interleaved
+            question_template_to_image_path: Dict[str, str] = {}
+            content: List[MediaObject] = []
+            for img_number in range(1, self.MAX_NUM_IMAGES):
+                image_id: str = f"image_{img_number}"
+                if row[image_id] is None:
+                    # At this point, there are no more images for this question
+                    break
+
+                # Save the image locally
+                image_path: str = os.path.join(images_path, f"{question_id}_{image_id}.png")
+                if not os.path.exists(image_path):
+                    row[image_id].save(image_path)
+
+                image_template_tag: str = f"<image {img_number}>"
+                question_template_to_image_path[image_template_tag] = image_path
+
+                # There are cases when the image is included, but it is not used either in the
+                # question template or in the answer options
+                if image_template_tag not in question_template:
+                    # The image is not in the question template
+                    continue
+
+                head, question_template = question_template.split(image_template_tag, 1)
+                if head:
+                    content.append(MediaObject(text=head, content_type="text/plain"))
+                content.append(MediaObject(location=image_path, content_type="image/png"))
+
+            # Add the rest of the question template
+            if question_template:
+                content.append(MediaObject(text=question_template, content_type="text/plain"))
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multiple-choice":
+                # The given correct answer is a letter, but we need an index
+                correct_answer_index: int = ord(answer) - ord("A")
+
+                for i, option in enumerate(options):
+                    reference: Reference
+                    is_correct: bool = i == correct_answer_index
+
+                    if option in question_template_to_image_path:
+                        # Image answer choices
+                        image_path = question_template_to_image_path[option]
+                        image_choice = MediaObject(location=image_path, content_type="image/png")
+                        reference = Reference(
+                            Output(multimedia_content=MultimediaObject([image_choice])),
+                            tags=[CORRECT_TAG] if is_correct else [],
+                        )
+                    else:
+                        # Text answer choices
+                        reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+
+                    references.append(reference)
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=VALID_SPLIT,
+                )
+            )
+
+        return instances
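The subtle part of get_instances is the interleaving loop: each <image n> tag splits the remaining question text, emitting the text before the tag and then the image, so text and image MediaObjects end up in reading order. A self-contained sketch of just that splitting logic on a toy template, with (kind, value) tuples standing in for MediaObjects:

    question_template = "Compare <image 1> with <image 2>. Which shows the larger molecule?"
    content = []
    for img_number in (1, 2):
        tag = f"<image {img_number}>"
        head, question_template = question_template.split(tag, 1)  # split off text before the tag
        if head:
            content.append(("text", head))
        content.append(("image", f"q1_image_{img_number}.png"))  # placeholder for the saved image path
    if question_template:
        content.append(("text", question_template))  # trailing text after the last tag
    # content: [("text", "Compare "), ("image", ...), ("text", " with "), ("image", ...), ("text", ". Which ...")]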