crfm-helm 0.3.0 (py3-none-any.whl) → 0.5.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,214 @@
1
+ import os.path
2
+ from typing import List, Optional, Dict, Any
3
+ from abc import abstractmethod
4
+
5
+ from datasets import load_dataset
6
+ from tqdm import tqdm
7
+
8
+ from helm.benchmark.scenarios.scenario import (
9
+ CORRECT_TAG,
10
+ ASSET_NAME_TAG,
11
+ ASSET_PATH_TAG,
12
+ TEST_SPLIT,
13
+ VALID_SPLIT,
14
+ Instance,
15
+ Input,
16
+ Output,
17
+ Reference,
18
+ Scenario,
19
+ )
20
+ from helm.common.media_object import MediaObject, MultimediaObject
21
+ from helm.common.general import ensure_directory_exists
22
+ from helm.common.hierarchical_logger import hlog
23
+
24
+ PROCESSED: str = "processed"
25
+
26
+
27
class Image2StructureScenario(Scenario):
    """Common base for scenarios that ask a model to recover the structure behind an image.

    Rows are read from a HuggingFace dataset. Subclasses provide the class-level
    configuration below and implement `compile_and_save`; they may also override
    `preprocess_row`, `build_prompt` and `finalize`.
    """

    # Configuration supplied by concrete subclasses.
    BASE_PROMPT: str
    HUGGINGFACE_DATASET_NAME: str
    SUBSETS: List[str]

    name: str
    description: str
    tags = ["vision-language"]

    # Maps a HELM split name to the split name passed to `load_dataset`.
    helm_split_to_huggingface_split = {
        TEST_SPLIT: "test",
        VALID_SPLIT: "validation",
    }

    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
        """Store the scenario configuration.

        Args:
            subset (str): Dataset subset to load; must be one of `SUBSETS`.
            recompile_prompt (bool): If True, the image is produced by `compile_and_save`
                from the row's structure; otherwise the image shipped with the row is saved.
            split (str): HELM split to generate instances for.
        """
        super().__init__()
        assert subset in self.SUBSETS, f"Invalid subset: {subset}"
        self._subset: str = subset
        self._recompile_prompt: bool = recompile_prompt
        self._split: str = split
        # Set when `get_instances` is called.
        self._output_path: Optional[str] = None

    def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
        """Default preprocessing: drop the "assets" entry and record that the row has no assets."""
        del row["assets"]
        row["assets_paths"] = []
        row["assets_names"] = []
        return row

    def build_prompt(self, row: Dict[str, Any]) -> str:
        """Return the prompt for a row; by default this is `BASE_PROMPT` for every row."""
        return self.BASE_PROMPT

    @abstractmethod
    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Compile the prompt, should save the image and return the text extracted from the image"""
        pass

    def finalize(self, row: Dict[str, Any]) -> None:
        """Perform cleanup operations after the instance has been generated."""
        pass

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build the instances of the scenario.

        `compile_and_save` must be implemented by the subclass, which may also override
        `preprocess_row` when rows need preprocessing.

        For each row of the dataset:
            1. Preprocess the row
            2. Save the image locally (only if it is not already on disk)
                - 2.a. Without recompilation, save the image of the row directly
                - 2.b. With recompilation, compile the structure and save the resulting image
            3. Create the prompt
            4. Create the multimedia content
            5. Create the references
            6. Finalize the row and create the Instance

        Args:
            output_path (str): Directory used for the dataset cache and for the
                generated images and assets.

        Returns:
            List[Instance]: The list of instances
        """
        self._output_path = output_path
        images_path: str = os.path.join(output_path, "data/images", self._subset)
        assets_path: str = os.path.join(output_path, "data/assets", self._subset)
        ensure_directory_exists(images_path)
        ensure_directory_exists(assets_path)

        collected: List[Instance] = []

        dataset = load_dataset(
            self.HUGGINGFACE_DATASET_NAME,
            self._subset,
            split=self.helm_split_to_huggingface_split[self._split],
            cache_dir=output_path,
        )
        for row in tqdm(dataset):
            question_uuid: str = str(row["uuid"]).replace('"', "")
            # The first and last characters of the category are dropped before comparing
            # (presumably surrounding quotes -- confirm against the dataset).
            if row["category"][1:-1] != self._subset:
                hlog(
                    f"Skipping instance {question_uuid} as it belong in category"
                    f" {row['category']} and not {self._subset}"
                )
                continue

            # Step 1: Preprocess the row
            row = self.preprocess_row(row, assets_path)

            # Step 2: Save the image locally, unless a previous run already produced it
            image_path: str = os.path.join(images_path, f"{question_uuid}.png")
            if not os.path.exists(image_path):
                if self._recompile_prompt:  # 2.b
                    if "structure" not in row:
                        raise ValueError("Cannot recompile prompt without structure")
                    row["text"] = self.compile_and_save(row["structure"], assets_path, image_path)
                else:  # 2.a
                    row["image"].save(image_path)

            # Step 3: Create the prompt
            prompt: str = self.build_prompt(row)

            # Step 4: Create the multimedia content
            image_object = MediaObject(location=image_path, content_type="image/png")
            content: List[MediaObject] = [
                MediaObject(text=prompt, content_type="text/plain"),
                image_object,
            ]

            # Step 5: Create the references
            # 5.a The main reference always carries the image. When the structure is an
            # existing path (e.g. a repository with the files used to render a webpage),
            # that path is attached as well. A structure equal to PROCESSED, or a structure
            # that is directly the source text (such as LaTeX code), adds nothing.
            reference_media: List[MediaObject] = [image_object]
            has_structure: bool = "structure" in row
            if has_structure and os.path.exists(row["structure"]):
                reference_media.append(MediaObject(location=row["structure"], content_type="path/path"))

            main_output: Output
            if has_structure or "text" in row:
                main_output = Output(text=row["text"], multimedia_content=MultimediaObject(reference_media))
            else:
                main_output = Output(multimedia_content=MultimediaObject(reference_media))
            references: List[Reference] = [Reference(output=main_output, tags=[CORRECT_TAG])]

            # 5.b Create the references containing the assets
            if len(row["assets_paths"]) > 0:
                # TODO: The text is for debugging purposes (to show in the frontend)
                asset_path_media = [
                    MediaObject(location=asset, content_type=f"image/{asset.split('.')[-1].lower()}")
                    for asset in row["assets_paths"]
                ]
                references.append(
                    Reference(
                        output=Output(
                            text=", ".join(row["assets_paths"]),
                            multimedia_content=MultimediaObject(asset_path_media),
                        ),
                        tags=[ASSET_PATH_TAG],
                    )
                )
                # TODO: The text is for debugging purposes (to show in the frontend)
                asset_name_media = [MediaObject(text=asset, content_type="text/plain") for asset in row["assets_names"]]
                references.append(
                    Reference(
                        output=Output(
                            text=", ".join(row["assets_names"]),
                            multimedia_content=MultimediaObject(asset_name_media),
                        ),
                        tags=[ASSET_NAME_TAG],
                    )
                )

            # Step 6: Finalize the Instance
            self.finalize(row)
            collected.append(
                Instance(
                    input=Input(multimedia_content=MultimediaObject(content)),
                    references=references,
                    split=self._split,
                )
            )

        assert len(collected) > 0, f"No instances found for subject {self._subset}"
        return collected
@@ -0,0 +1,25 @@
1
+ from helm.benchmark.scenarios.scenario import VALID_SPLIT
2
+ from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
3
+ latex_to_image,
4
+ strip_unnecessary_latex_parts,
5
+ )
6
+ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
7
+
8
+
9
class LatexScenario(Image2StructureScenario):
    """Image2Structure scenario whose structure is LaTeX code."""

    BASE_PROMPT = (
        "Please provide the LaTeX code used to generate this image. "
        "Only generate the code relevant to what you see. "
        "Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."
    )
    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
    SUBSETS = ["equation", "table", "plot", "algorithm"]

    name = "image2latex"
    description = "Evaluate multimodal models on Latex generation to recreate a provided image"

    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
        """Forward the configuration unchanged to `Image2StructureScenario`."""
        super().__init__(subset, recompile_prompt, split)

    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Render the LaTeX `structure` to a cropped image saved at `destination_path`.

        Returns:
            str: The LaTeX code that was actually compiled, after
            `strip_unnecessary_latex_parts` has been applied to it.
        """
        rendered, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
        rendered.save(destination_path)
        assert "latex_code" in infos
        return strip_unnecessary_latex_parts(infos["latex_code"])
@@ -0,0 +1,20 @@
1
+ from helm.benchmark.scenarios.scenario import VALID_SPLIT
2
+ from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
3
+
4
+
5
class MusicSheetScenario(Image2StructureScenario):
    """Image2Structure scenario asking for Lilypond code that recreates a music sheet image."""

    BASE_PROMPT = (
        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasible possible.\n"  # noqa: E501
        "This music sheet was created by me, and I would like to recreate it using Lilypond."
    )
    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
    SUBSETS = ["music"]

    name = "image2musicsheet"
    description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"

    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
        """Forward the configuration unchanged to `Image2StructureScenario`."""
        super().__init__(subset, recompile_prompt, split)

    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Always raises: this scenario cannot compile a structure into an image."""
        raise Exception("Music sheets have no ground truth, compilation is not possible")
@@ -0,0 +1,347 @@
1
+ from typing import Optional, Tuple, List, Dict, Any
2
+
3
+ import io
4
+ import os
5
+ import re
6
+
7
+ from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
8
+
9
+ try:
10
+ from latex import build_pdf
11
+ from pdf2image import convert_from_bytes
12
+ from PIL import ImageOps
13
+ from PIL.Image import Image
14
+ except ModuleNotFoundError as e:
15
+ handle_module_not_found_error(e, suggestions=["image2structure"])
16
+
17
# LaTeX preamble
# Make sure to install "latex-full".
TEX_INCLUDES = r"""
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{algorithmicx}
\usepackage{algpseudocode}
\usepackage{listings}
\usepackage{stfloats}
\usepackage{epstopdf}
\usepackage{pgfplots}
\usepackage{tikz}
\usepackage{tikz-cd}
\usepackage{tikz-qtree}
\usepackage{tikz-dependency}
\usepackage{tikz-3dplot}
\usepackage{tikz-network}
\usepackage[flushleft]{threeparttable}
\usepackage{adjustbox}
"""

# LaTeX delimiters (each defined exactly once)
TEX_BEGIN_FILE = r"""\documentclass{article}"""
TEX_BEGIN_DOCUMENT = r"""\begin{document}"""
TEX_END_DOCUMENT = r"""\end{document}"""

# Number of times to try to fix the LaTeX code
MAX_NUM_TRIES: int = 3

# (numbered environment, unnumbered variant) pairs; each pair is applied as a plain
# string replacement, so listing a pair more than once has no additional effect.
TEX_REPLACE_NUMBERING: List[Tuple[str, str]] = [
    ("{equation}", "{equation*}"),
    ("{align}", "{align*}"),
    ("{alignat}", "{alignat*}"),
    ("{gather}", "{gather*}"),
    ("{flalign}", "{flalign*}"),
    ("{multline}", "{multline*}"),
    ("{eqnarray}", "{eqnarray*}"),
    ("{subeqnarray}", "{subeqnarray*}"),
    ("{aligneq}", "{aligneq*}"),
]
65
+
66
+
67
def latex_to_pdf(latex_code: str, assets_path: str) -> io.BytesIO:
    """Compile `latex_code` with `build_pdf` and return the PDF as an in-memory byte stream.

    `assets_path` is joined onto the directory containing this module, so an absolute
    `assets_path` is used as is. The resulting directory is passed to `build_pdf` as a
    `texinputs` entry, followed by an empty entry (presumably to keep the default TeX
    search path -- confirm against the `latex` package documentation).
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    assets_dir = os.path.join(module_dir, assets_path)
    compiled = build_pdf(latex_code, texinputs=[assets_dir, ""])
    # Wrap the raw PDF bytes in a stream
    return io.BytesIO(compiled.data)
72
+
73
+
74
def pdf_to_image(
    pdf_stream: io.BytesIO,
    crop: bool = False,
    resize_to: Optional[Tuple[int, int]] = None,
) -> Image:
    """Rasterize the first page of a PDF stream.

    Args:
        pdf_stream (io.BytesIO): Stream holding the PDF; it is read from its current position.
        crop (bool, optional): If True, drop the bottom 20% of the page (pagination) and then
            crop to the bounding box of the inverted image (white border). Defaults to False.
        resize_to (Optional[Tuple[int, int]], optional): If set, size the image is resized to
            after the optional crop. Defaults to None.

    Returns:
        Image: The image of the first page.

    Raises:
        Exception: If the conversion yields no page.
    """
    pages = convert_from_bytes(pdf_stream.read(), first_page=1, last_page=1)
    if not pages:
        raise Exception("PDF to Image conversion failed")

    page = pages[0]

    if crop:
        width, height = page.size
        # Remove pagination
        page = page.crop((0, 0, width, height - int(height * 0.2)))
        # Remove white border
        page = page.crop(ImageOps.invert(page).getbbox())

    if resize_to:
        page = page.resize(resize_to)

    return page
97
+
98
+
99
def strip_unnecessary_latex_parts(latex_code: str) -> str:
    """Strip unnecessary parts of the LaTeX code.

    Removes comments, the \\documentclass and \\usepackage lines, the document
    delimiters (together with everything after \\end{document}), and normalizes
    whitespace.

    Args:
        latex_code (str): The full LaTeX source.

    Returns:
        str: The reduced LaTeX code, without leading or trailing whitespace.
    """

    # Remove comments. A percent sign preceded by a backslash ("\%") is literal text
    # and is kept.
    minimal_latex_code = re.sub(r"(?<!\\)%.*?\n", "\n", latex_code)

    # Remove \documentclass and any \usepackage lines.
    # This continues from the comment-free text rather than restarting from `latex_code`,
    # which would throw the comment removal away.
    minimal_latex_code = re.sub(r"\\documentclass\{.*?\}\n", "", minimal_latex_code)
    minimal_latex_code = re.sub(r"\\usepackage(\[.*?\])?\{.*?\}\n", "", minimal_latex_code)

    # Remove \begin{document} (and the newlines following it), and \end{document}
    # together with everything after it
    minimal_latex_code = re.sub(r"\\begin\{document\}\n*", "", minimal_latex_code, flags=re.DOTALL)
    minimal_latex_code = re.sub(r"\\end\{document\}.*", "", minimal_latex_code, flags=re.DOTALL)

    # Ensure \begin{...} is followed by a \n
    minimal_latex_code = re.sub(r"(\\begin\{.*?\}(\[.*?\])?)(?!\n)", r"\1\n", minimal_latex_code)
    # Ensure \end{...} is followed by a \n
    minimal_latex_code = re.sub(r"(\\end\{.*?\})(?!\n)", r"\1\n", minimal_latex_code)

    # Normalize space sequences to a single space globally
    minimal_latex_code = re.sub(r" +", " ", minimal_latex_code)
    # Replace tabs with a single space
    minimal_latex_code = re.sub(r"\t", " ", minimal_latex_code)
    # Remove leading and trailing spaces on each line
    minimal_latex_code = re.sub(r"^[ \t]+|[ \t]+$", "", minimal_latex_code, flags=re.MULTILINE)
    # Remove unnecessary whitespace - multiple empty lines and tabulations
    minimal_latex_code = re.sub(r"\n\s*\n", "\n", minimal_latex_code)

    return minimal_latex_code.strip()
128
+
129
+
130
def handle_latex_error(
    e: Exception,
    original_latex_code: str,
    assets_path: str,
    crop: bool,
    resize_to: Optional[Tuple[int, int]],
    num_try_remaining: int,
) -> Tuple[Image, Dict[str, Any]]:
    """Decide whether a failed LaTeX compilation can be repaired, and retry if so.

    Errors caused by the LaTeX code itself are re-raised immediately. Two kinds of
    errors are considered fixable while tries remain: content that has to be in math
    mode, and an undefined environment (tolerated because the prompt tells the model
    that includes are not necessary).

    Args:
        e (Exception): The error raised by the failed compilation.
        original_latex_code (str): The LaTeX code that failed to compile.
        assets_path (str): The path to the assets, forwarded to `latex_to_image`.
        crop (bool): Forwarded to `latex_to_image`.
        resize_to (Optional[Tuple[int, int]]): Forwarded to `latex_to_image`.
        num_try_remaining (int): The number of tries remaining.

    Returns:
        The result of `latex_to_image` on the repaired code.

    Raises:
        RuntimeError: If the error is not fixable, or no fix could be applied.
    """
    # All checks below run on a newline-free copy of the message, so that a pattern is
    # still found when the message is broken across several lines.
    str_e: str = str(e).replace("\n", "")

    # Errors that are caused by the original LaTeX code itself
    # and should not be fixed by trying again with a different code
    # TODO #2346: Make this list more exhaustive
    # Source of the descriptions:
    # - https://www.overleaf.com/learn/latex/Errors
    # - https://tex.stackexchange.com/
    generation_errors: List[str] = [
        # LaTeX encountered an undefined control sequence. Example: \blabla
        r"""Undefined control sequence""",
        # An \item command was forgotten, or a list is used incorrectly inside a table.
        # Example:
        # \begin{itemize}
        # First item without the \item command
        # \end{itemize}
        r"""LaTeX Error: Lonely \item--perhaps a missing list environment.""",
        # A { or } is missing. Example: \sum_{i=1 ^n
        r"""Missing } inserted""",
        r"""Missing { inserted""",
        # Double subscript. Example: a_b_c
        r"""Double subscript.""",
        # An environment or $ was added around something that cannot be typeset
        # in the given mode.
        # Example:
        # $
        # \begin{table}
        # ...
        # \end{table}
        # $
        r"""LaTeX Error: Not in outer par mode.""",
        # While typesetting a table, an alignment character ( & ) was found where
        # none was expected
        r"""Extra alignment tab has been changed to \cr.""",
        # Missing control sequence other than $ (which is handled below).
        # Example: \left( without
        "Missing \\",
        # LaTeX Error: \begin{<env>} on input line <line> ended by \end{<diff_env>}
        # An environment is not properly closed.
        # Example:
        # \begin{table}
        # ...
        # \end{document}
        r"""LaTeX Error: \begin{""",
        # A \noalign command is in the wrong place.
        # Example:
        # \begin{tabular}
        # \noalign{\hrule}
        # ...
        # \end{tabular}
        r"""Misplaced \noalign""",
        # LaTeX Error: Command <command> already defined.
        # Two packages define the same command.
        # We cannot fix this as we would have to try to find the conflicting packages.
        # Example:
        # \usepackage{algorithmic}
        # \usepackage{algorithmicx}
        r""" already defined.""",
    ]
    if any(error_message in str_e for error_message in generation_errors):
        raise RuntimeError(str(e)) from e

    if num_try_remaining > 0:
        # Check if the error is easily fixable
        fixed_code: str = original_latex_code

        # Equation not in math mode
        # We correct this error as the prompt might not make it obvious whether the output
        # should be <EQUATION_CODE> or $<EQUATION_CODE>$.
        # Only this case is handled: a missing $ coming from elsewhere is a problem of the
        # generated code, not of unclear instructions.
        # Error format: "Missing $ inserted" or "<command> allowed only in math mode"
        if "Missing $ inserted" in str_e or " allowed only in math mode" in str_e:
            # Only wrap the content after \begin{document} and before \end{document}.
            # $$...$$ (display math) is used rather than $...$ to avoid inline mode.
            fixed_code = re.sub(
                r"(?<=\\begin{document})(.*?)(?=\\end{document})",
                r"$$\1$$",
                fixed_code,
                flags=re.DOTALL,
            )

        # Missing include
        # Missing includes are tolerated as the prompt suggests that it is not necessary to
        # include them, and our TEX_INCLUDES might lack some packages.
        # Error format: "LaTeX Error: Environment <env> undefined."
        # The capture is non-greedy so that only the first environment name is taken, even
        # if " undefined" appears again later in the message.
        undefined_search = re.search(r"LaTeX Error: Environment (.*?) undefined", str_e)
        if undefined_search:
            # If a package is missing and this is our first retry, then simply include TEX_INCLUDES
            if num_try_remaining == MAX_NUM_TRIES:
                fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")
            if num_try_remaining < MAX_NUM_TRIES or fixed_code == original_latex_code:
                # Here we try to manually solve the missing environment.
                # This is executed on later retries, or on the first one if it changed nothing.
                assert TEX_INCLUDES in fixed_code, "TEX_INCLUDES should be present in the code"
                # TEX_INCLUDES is already present, so we add the missing package.
                # Since we cannot know the name of the package that contains the missing
                # environment, we simply hope that they are named the same way.
                env_undefined: str = undefined_search.group(1)

                if f"\\usepackage{{{env_undefined}}}" in fixed_code:
                    # We already tried to include the missing package, but it probably
                    # does not exist, so we raise an error
                    raise RuntimeError(str(e)) from e

                fixed_code = fixed_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + f"\n\\usepackage{{{env_undefined}}}\n")

        # Try again with the fixed code (if the fixed code is different from the original code)
        if fixed_code != original_latex_code:
            return latex_to_image(
                fixed_code,
                assets_path=assets_path,
                crop=crop,
                resize_to=resize_to,
                num_try_remaining=num_try_remaining - 1,
            )

    # TODO #2346: Ideally we should never reach this point
    # All errors should be either detected as:
    # - generation error: should not be fixed and raised
    # - easily fixable: should be fixed and tried again
    # If we reach this point, it means that none of the above cases were detected.
    raise RuntimeError(str(e)) from e
265
+
266
+
267
def latex_to_image(
    original_latex_code: str,
    assets_path: str,
    crop: bool = False,
    resize_to: Optional[Tuple[int, int]] = None,
    num_try_remaining: int = MAX_NUM_TRIES,
) -> Tuple[Image, Dict[str, Any]]:
    """Convert a LaTeX code to an image.

    Args:
        original_latex_code (str): The LaTeX code to convert to an image.
        assets_path (str): The path to the assets.
        crop (bool, optional): Whether to crop the image. Defaults to False.
        resize_to (Optional[Tuple[int, int]], optional): The size to resize the image to. Defaults to None.
        num_try_remaining (int, optional): The number of tries remaining. Defaults to MAX_NUM_TRIES.

    Returns:
        image (Image): The image of the LaTeX code.
        infos (Dict[str, Any]): a dictionary containing:
            image_size (Tuple[int, int]): The size of the image.
            latex_code (str): The modified LaTeX code that was successfully compiled.

    Raises:
        OptionalDependencyNotInstalled: If LaTeX is not installed.
        RuntimeError: If the LaTeX code cannot be converted to an image.
    """
    # Basic LaTeX processing
    # These changes must not break LaTeX code that already compiles.
    # Riskier fixes live in handle_latex_error and are only applied
    # if the code does not compile.

    # 0. Remove all environments that might cause numbering
    # This is important because the numbering of the equations might change
    # the bounding box of the image.
    for numbered, unnumbered in TEX_REPLACE_NUMBERING:
        original_latex_code = original_latex_code.replace(numbered, unnumbered)
    # Also remove all \label commands, together with the trailing blanks and
    # the newline that follows, if any
    original_latex_code = re.sub(r"\\label\{.*?\}[\t ]*(\n)?", "", original_latex_code)

    # 1. Add begin/end document if not present
    if TEX_BEGIN_DOCUMENT not in original_latex_code and TEX_BEGIN_FILE not in original_latex_code:
        original_latex_code = TEX_BEGIN_DOCUMENT + original_latex_code
    if TEX_END_DOCUMENT not in original_latex_code:
        original_latex_code = original_latex_code + TEX_END_DOCUMENT

    # 2. Add preamble
    # 2.1. Replace any \documentclass with our own. The optional [options] part is
    # matched as well: otherwise a \documentclass[...]{...} would go undetected and a
    # second \documentclass would be prepended in front of it.
    documentclass_pattern = r"\\documentclass(\[.*?\])?\{.*?\}"
    if re.search(documentclass_pattern, original_latex_code):
        # A callable replacement is used because TEX_BEGIN_FILE contains a backslash,
        # which re.sub would otherwise interpret as an escape in the template.
        original_latex_code = re.sub(documentclass_pattern, lambda _match: TEX_BEGIN_FILE, original_latex_code)
    else:
        # If there is no \documentclass, we add our own
        original_latex_code = TEX_BEGIN_FILE + "\n\n" + original_latex_code

    # 2.2. Add includes. In this first step, we only add includes if none are present.
    # We do this because if some are present, we might define them twice which can cause errors
    # and this section should not make the original LaTeX code fail if it was compilable.
    # If there are missing packages, handle_latex_error will add TEX_INCLUDES after
    # \documentclass, which might define some packages twice, but often solves the problem.
    if not re.search(r"\\usepackage\{.*\}", original_latex_code):
        original_latex_code = original_latex_code.replace(TEX_BEGIN_FILE, TEX_BEGIN_FILE + "\n" + TEX_INCLUDES + "\n")

    latex_code: str = original_latex_code
    try:
        pdf_stream = latex_to_pdf(latex_code, assets_path=assets_path)
        image = pdf_to_image(pdf_stream, crop=crop, resize_to=resize_to)
        return image, {"image_size": image.size, "latex_code": latex_code}
    except Exception as e:
        latex_missing: bool = (
            isinstance(e, RuntimeError)
            and str(e) == "No available builder could be instantiated. Please make sure LaTeX is installed."
        )
        if latex_missing:
            raise OptionalDependencyNotInstalled(
                "Optional dependency LaTeX is not installed. "
                "Please install LaTeX and make sure it is available in your PATH."
                "You can install LaTeX on Ubuntu with `sudo apt-get install texlive-full`."
            ) from e
        # Any other failure is either repaired and retried, or re-raised as RuntimeError
        return handle_latex_error(e, original_latex_code, assets_path, crop, resize_to, num_try_remaining)