crfm-helm: 0.3.0 (py3-none-any.whl) → 0.5.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,450 @@
1
+ from typing import List, Dict, Optional, Callable, Tuple, Any, Set
2
+ from dataclasses import dataclass
3
+ from torchvision import transforms, models
4
+ from skimage.metrics import structural_similarity as ssim
5
+ from nltk.tokenize.treebank import TreebankWordTokenizer
6
+
7
+ import torch
8
+ import warnings
9
+ import numpy as np
10
+ import os
11
+ import tempfile
12
+
13
+ from helm.benchmark.metrics.copyright_metrics import _edit_similarity
14
+ from helm.benchmark.metrics.metric import Metric
15
+ from helm.benchmark.metrics.metric_service import MetricService
16
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
17
+ from helm.common.images_utils import open_image
18
+ from helm.common.gpu_utils import get_torch_device
19
+ from helm.common.cache import Cache
20
+ from helm.benchmark.adaptation.request_state import RequestState
21
+ from helm.common.media_object import MediaObject
22
+ from helm.common.optional_dependencies import handle_module_not_found_error
23
+ from helm.common.hierarchical_logger import hlog
24
+ from helm.benchmark.metrics.metric_name import MetricName
25
+ from helm.benchmark.metrics.statistic import Stat
26
+ from helm.benchmark.metrics.vision_language.image_utils import (
27
+ preprocess_image,
28
+ pixel_similarity,
29
+ sift_similarity,
30
+ )
31
+ from helm.benchmark.metrics.vision_language.emd_utils import compute_emd_recursive
32
+
33
+ try:
34
+ from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
35
+ from PIL import Image
36
+ import imagehash
37
+ except ModuleNotFoundError as e:
38
+ handle_module_not_found_error(e, suggestions=["image2structure"])
39
+
40
+
41
def pad(small_image: Image.Image, large_image: Image.Image, axis: int) -> Image.Image:
    """Return a white-padded copy of ``small_image`` grown along ``axis``.

    The extent along ``axis`` is taken from ``large_image``; the other extent keeps
    the small image's own size. The small image is pasted at the top-left corner of
    a new white RGB canvas, so the padding is added on the right/bottom side.
    """
    canvas_dims: List[int] = [*small_image.size]
    canvas_dims[axis] = large_image.size[axis]
    canvas_size: Tuple[int, int] = tuple(canvas_dims)  # type: ignore
    canvas: Image.Image = Image.new("RGB", canvas_size, (255, 255, 255))
    canvas.paste(small_image, (0, 0))
    return canvas
49
+
50
+
51
class CompilationError(Exception):
    """Error type for a failed compilation (presumably of a completion into an image).

    NOTE(review): not raised anywhere in this module; confirm where it is used.
    """
53
+
54
+
55
@dataclass
class AnnotatedMetric:
    """Associates a metric name with its scoring callable and the input kind it consumes."""

    # Name under which the metric is registered and reported.
    name: str
    # Scoring callable; the evaluation loop invokes it as function(prediction, ground_truth).
    function: Callable
    # Tag selecting the prepared input pair, e.g. "image_PIL", "image_np", "image_np_gray", "text_str".
    input_type: str
60
+
61
+
62
+ class AnnotatedImageMetrics(Metric):
63
+ """Abstract class for image metrics.
64
+
65
+ This class is designed to evaluate metrics on images that should be generated using the text
66
+ output of the model, such as LaTeX, HTML, etc.
67
+
68
+ The class provides a method to compile the completion into an image and then evaluate the
69
+ similarity between the generated image and the reference image using different metrics.
70
+
71
+ In addition to the metrics, the class also provides a metric to evaluate the compilation success.
72
+ If the compilation fails, the similarity metrics are not evaluated and are all set to the most
73
+ dissimilar value.
74
+
75
+ Since compilation can be expensive, the class provides a cache to store the compiled images.
76
+ In addition metrics can also be cached to avoid recomputation.
77
+ """
78
+
79
+ # Metric names
80
+ COMPILE_METRIC: str = "compilation_success"
81
+ EARTH_MOVER_SIMILARITY: str = "earth_mover_similarity"
82
+ PIXEL_SIMILARITY: str = "pixel_similarity"
83
+ SIFT_SIMILARITY: str = "sift_similarity"
84
+ LPIPS_SIMILARITY: str = "lpips_similarity"
85
+ SSIM_SIMILARITY: str = "ssim_similarity"
86
+ FID_SIMILARITY: str = "fid_similarity"
87
+ EDIT_SIMILARITY: str = "edit_similarity"
88
+ NORMALIZE_FID_FACTOR: float = 0.0025
89
+
90
+ SIZE_HANDLING_METHODS: List[str] = ["resize", "padding", "none"]
91
+
92
+ # Hashing (for caching)
93
+ HASH_LENGTH: int = 16
94
+ HASH_FUNC: Callable = imagehash.average_hash
95
+
96
def __init__(self, generation_type: str, metric_names: List[str], size_handling_method: str = "resize"):
    """Store the configuration and register the available metrics.

    Args:
        generation_type: Kind of generation being evaluated; used to derive the
            compiler annotation name and the cache name.
        metric_names: Names of the metrics to evaluate (keys of ``self.metrics``).
        size_handling_method: How a size mismatch between the compiled image and the
            reference image is handled ("resize", "padding" or "none").
    """
    # Caller-supplied configuration.
    self.generation_type = generation_type
    self._metric_names: List[str] = metric_names
    self._size_handling_method: str = size_handling_method

    # Heavy resources start empty and are created on first use.
    self._lpips_metric: Optional[LearnedPerceptualImagePatchSimilarity] = None
    self._inception_model: Optional[models.Inception3] = None
    self._cache: Optional[Cache] = None

    self._device = get_torch_device()
    self._tokenizer = TreebankWordTokenizer()

    # (name, scoring function, input kind) for every supported metric.
    registry: Dict[str, AnnotatedMetric] = {}
    for metric_name, function, input_type in (
        (self.PIXEL_SIMILARITY, pixel_similarity, "image_np_gray"),
        (self.SIFT_SIMILARITY, sift_similarity, "image_np"),
        (self.EARTH_MOVER_SIMILARITY, self.compute_emd_similarity_recursive, "image_PIL"),
        (self.LPIPS_SIMILARITY, self.lpips_similarity, "image_PIL"),
        (self.FID_SIMILARITY, self.fid_similarity, "image_PIL"),
        (self.SSIM_SIMILARITY, self.compute_ssim, "image_np_gray"),
        (self.EDIT_SIMILARITY, self.compute_edit_sim, "text_str"),
    ):
        registry[metric_name] = AnnotatedMetric(metric_name, function, input_type)
    self.metrics: Dict[str, AnnotatedMetric] = registry
116
+
117
def _get_compilation_cache_key(self, completion: str) -> Dict[str, str]:
    """Return the cache key for compiling ``completion`` with this generation type."""
    key: Dict[str, str] = dict(generation_type=self.generation_type, completion=completion)
    return key
122
+
123
def _prepare_inputs(
    self,
    inputs_required: Set[str],
    request_state: RequestState,
    annotation: Dict[str, Any],
    ref_image: Optional[Image.Image],
) -> Dict[str, Tuple[Any, Any]]:
    """Build the ``(prediction, ground_truth)`` pairs consumed by the requested metrics.

    Args:
        inputs_required: Input kinds requested by the metrics. Supported kinds are
            "image_PIL", "image_np", "image_np_gray" and "text_str".
        request_state: Request being evaluated; its first reference supplies the
            ground-truth text for text inputs.
        annotation: Compiler annotation of one completion. Image inputs need a local
            image ``MediaObject`` under "media_object"; text inputs need "text".
        ref_image: Reference image. Must not be None when an image input is requested.

    Returns:
        Mapping from input kind to ``(prediction, ground_truth)``. "image_PIL" is
        always present as soon as any image input is requested.

    Raises:
        ValueError: If the images differ in size and the size handling method is
            "none" or unknown, or if a required input could not be produced.
        AssertionError: If the annotation or reference image is missing/malformed,
            or if an unsupported input kind is requested.
    """
    inputs: Dict[str, Tuple[Any, Any]] = {}

    # Image
    if any(input_type.startswith("image") for input_type in inputs_required):
        # Get the image and make sure we have a reference image
        assert ref_image is not None
        assert "media_object" in annotation
        assert isinstance(annotation["media_object"], MediaObject)
        media_object: MediaObject = annotation["media_object"]
        assert media_object.type == "image"
        assert media_object.is_local_file and media_object.location is not None
        # convert() returns an independent image, so the file handle can be
        # released deterministically instead of waiting for garbage collection.
        with Image.open(media_object.location) as opened_image:
            image: Image.Image = opened_image.convert("RGB")

        # Handle difference in size
        if image.size != ref_image.size:
            if self._size_handling_method == "none":
                raise ValueError(
                    "Compiled image and reference image should have the same size"
                    " when the size handling method is none."
                )
            elif self._size_handling_method == "resize":
                image = image.resize(ref_image.size)
            elif self._size_handling_method == "padding":
                # Grow whichever image is smaller along each axis; ref_image is only
                # rebound locally, the caller's reference image is left untouched.
                for axis in range(2):
                    if image.size[axis] < ref_image.size[axis]:
                        image = pad(image, ref_image, axis)
                    elif image.size[axis] > ref_image.size[axis]:
                        ref_image = pad(ref_image, image, axis)
            else:
                raise ValueError(f"size handling method {self._size_handling_method} not recognized.")
        assert image.size == ref_image.size

        # Save the inputs
        inputs["image_PIL"] = (image, ref_image)

        # Convert to numpy array
        if "image_np" in inputs_required:
            rgb_ref_image: np.ndarray = np.array(ref_image)
            rgb_image: np.ndarray = np.array(image)
            inputs["image_np"] = (rgb_image, rgb_ref_image)
        if "image_np_gray" in inputs_required:
            gray_ref_image: np.ndarray = preprocess_image(ref_image)
            gray_image: np.ndarray = preprocess_image(image)
            inputs["image_np_gray"] = (gray_image, gray_ref_image)

    # Text
    if any(input_type.startswith("text") for input_type in inputs_required):
        assert "text" in annotation
        text: str = annotation["text"]
        reference = request_state.instance.references[0]
        inputs["text_str"] = (text, reference.output.text)

    # Check that all inputs are present
    SUPPORTED_INPUTS: List[str] = ["image_PIL", "image_np", "image_np_gray", "text_str"]
    for input_type in inputs_required:
        if input_type not in SUPPORTED_INPUTS:
            raise AssertionError(f"Input type {input_type} is not supported.")
        if input_type not in inputs:
            raise ValueError(f"Input type {input_type} is required for the metrics but not present.")

    return inputs
191
+
192
def evaluate_generation(
    self,
    adapter_spec: AdapterSpec,
    request_state: RequestState,
    metric_service: MetricService,
    eval_cache_path: str,
) -> List[Stat]:
    """Score every completion of ``request_state`` against its first reference.

    For each completion the compiler annotation named ``<generation_type>_compiler``
    is read. If it reports an error, the compilation metric and all requested metrics
    receive 0. Otherwise each requested metric is computed (through the metric cache)
    and the compilation metric receives 1. A metric that raises is logged and scored 0.

    Args:
        adapter_spec: Unused here.
        request_state: Request with its result and compiler annotations.
        metric_service: Provides the cache used to store metric values.
        eval_cache_path: Unused here.

    Returns:
        One ``Stat`` per requested metric plus one for the compilation success.

    Raises:
        ValueError: If annotations or results are missing, or the expected compiler
            annotation is absent.
        Exception: If the reference image is not a local file.
    """
    compiler_name: str = f"{self.generation_type}_compiler"
    if self._cache is None:
        self._cache = metric_service.get_cache(f"image_metrics_{self.generation_type}")

    stats_dict: Dict[str, Stat] = {
        name: Stat(MetricName(name)) for name in (self._metric_names + [self.COMPILE_METRIC])
    }

    if request_state.annotations is None or request_state.result is None:
        # The two literals are concatenated into ONE message. A comma between them
        # would pass two arguments and make the error render as a tuple.
        raise ValueError(
            "Annotations and results should be present."
            " Please make sure to add a compiler annotator to the run spec."
        )
    if compiler_name not in request_state.annotations:
        raise ValueError(f"Compiler {compiler_name} should be present in the annotations.")

    inputs_required: Set[str] = {self.metrics[metric_name].input_type for metric_name in self._metric_names}

    # Get the image reference (only once as opening an image is slow)
    # The text annotation can be loaded several times without performance issues
    reference = request_state.instance.references[0]
    ref_image: Optional[Image.Image] = None
    if any(input_type.startswith("image") for input_type in inputs_required):
        assert reference.output.multimedia_content is not None
        assert len(reference.output.multimedia_content.media_objects) > 0
        ref_media_object: MediaObject = reference.output.multimedia_content.media_objects[0]
        assert ref_media_object.type == "image"
        if ref_media_object.is_local_file and ref_media_object.location is not None:
            ref_image = open_image(ref_media_object.location)
        else:
            raise Exception(
                "Remote images are not supported in metrics. "
                "Images should be downloaded when constructing the instance."
            )

    # For each completion, evaluate the metrics
    assert request_state.result is not None
    for completion_index in range(len(request_state.result.completions)):
        annotation: Dict[str, Any] = request_state.annotations[compiler_name][completion_index]

        # Handle errors in annotation
        if "unknown_error" in annotation:
            hlog(
                f"Unknown error in annotation: {annotation['unknown_error']}\n"
                "Scores of zero will be returned for all metrics."
            )
        if "error" in annotation or "unknown_error" in annotation:
            stats_dict[self.COMPILE_METRIC].add(0)  # Did not compile
            # For all other metrics, we set the value to zero
            for metric_name in self._metric_names:
                stats_dict[metric_name].add(0)
            continue

        # Get the inputs
        inputs = self._prepare_inputs(inputs_required, request_state, annotation, ref_image)

        # Hash the images for the cache key
        hash_dict: Optional[Dict[str, str]] = None
        if "image_PIL" in inputs:
            (image, _) = inputs["image_PIL"]
            hash_dict = {
                "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
                "generated_image": str(AnnotatedImageMetrics.HASH_FUNC(image, hash_size=self.HASH_LENGTH)),
            }

        # Evaluate the metrics
        for metric_name in self._metric_names:
            metric: AnnotatedMetric = self.metrics[metric_name]
            (pred, gt) = inputs[metric.input_type]

            value: float
            try:

                def do_it():
                    value = metric.function(pred, gt)
                    return {"value": value}

                # Text inputs are keyed by their content; image inputs are keyed by
                # the image hashes because the images themselves cannot be the key.
                cache_key = {"metric_name": metric_name, "pred": pred, "gt": gt}
                if not isinstance(pred, str):
                    assert hash_dict is not None
                    cache_key = {"metric_name": metric_name, **hash_dict}
                response_metric, _ = self._cache.get(cache_key, do_it)
                value = response_metric["value"]
            except Exception as e:
                # Best effort: a failing metric must not abort the whole evaluation.
                hlog(f"Error in metric {metric_name}: {str(e)}")
                value = 0
            stats_dict[metric_name].add(value)

        stats_dict[self.COMPILE_METRIC].add(1)  # Compiled

    return list(stats_dict.values())
292
+
293
def lpips_similarity(self, generated_image: Image.Image, reference_image: Image.Image) -> float:
    """Compute an LPIPS-based similarity between the generated and reference images.

    LPIPS itself is a perceptual *distance* (lower means more alike). The value
    returned here is ``1 - distance`` so that, like the other metrics of this class,
    a higher score means a closer match and the 0 assigned to failed compilations is
    the unfavourable end of the scale.

    NOTE(review): values cached by an earlier version of this method under the same
    key hold the raw distance; such a cache should be cleared.

    This metric is defined here as it requires loading the LPIPS model.
    Storing the model in this class is easier than passing it as an argument.

    Args:
        generated_image: Image compiled from the model completion.
        reference_image: Ground-truth image.

    Returns:
        ``1 - LPIPS(generated_image, reference_image)``.
    """
    if self._lpips_metric is None:
        # Model construction emits UserWarnings that are not actionable here.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            self._lpips_metric = LearnedPerceptualImagePatchSimilarity(net_type="vgg").to(self._device)

    # Normalizing with mean/std 0.5 maps ToTensor's [0, 1] output to [-1, 1].
    preprocessing = transforms.Compose(
        [
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
        ]
    )
    generated_image_tensor = preprocessing(generated_image)
    reference_image_tensor = preprocessing(reference_image)

    # Add batch dimension (B, C, H, W) since torchmetrics expects batches
    img1 = generated_image_tensor.unsqueeze(0).to(self._device)
    img2 = reference_image_tensor.unsqueeze(0).to(self._device)

    # Compute the LPIPS distance and turn it into a similarity
    assert self._lpips_metric is not None
    distance: float = self._lpips_metric(img1, img2).detach().item()
    return 1.0 - distance
322
+
323
def _calculate_fid(self, act1, act2):
    """Return the squared Euclidean distance between ``act1[0]`` and ``act2[0]``.

    This is not a standard FID: with a single observation per image no meaningful
    covariance matrix exists, so the covariance term of the FID is left out and only
    the squared difference of the "means" (the first row of each activation array)
    is used.
    """
    first_mean = act1[0]
    second_mean = act2[0]
    delta = first_mean - second_mean
    return np.sum(delta**2.0)
339
+
340
def _get_inception_features(self, img_tensor):
    """Run ``img_tensor`` through Inception v3 and return the output as a numpy array.

    The model is loaded on first use with ImageNet weights and kept on the instance.
    If loading fails with a PermissionError, TORCH_HOME is pointed (process-wide) at
    a fresh temporary directory and loading is retried once.
    """
    if self._inception_model is None:

        def build_model():
            weights = models.Inception_V3_Weights.IMAGENET1K_V1
            network = models.inception_v3(weights=weights, transform_input=False)
            return network.to(self._device)

        try:
            loaded_model = build_model()
        except PermissionError:
            # If access denied, use a temporary directory
            hlog("Access denied to torch cache directory. Using a temporary directory.")
            os.environ["TORCH_HOME"] = tempfile.mkdtemp()
            loaded_model = build_model()
        self._inception_model = loaded_model
        self._inception_model.eval()

    with torch.no_grad():
        # Make sure inference never runs in training mode.
        if self._inception_model.training:
            self._inception_model.eval()
        output = self._inception_model(img_tensor)
    return output.cpu().detach().numpy()
362
+
363
def _preprocess_image(self, image):
    """Apply the Inception v3 preprocessing: resize, center-crop to 299, tensor, normalize."""
    # Source: https://pytorch.org/hub/pytorch_vision_inception_v3/
    steps = [
        transforms.Resize(299),
        transforms.CenterCrop(299),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    pipeline = transforms.Compose(steps)
    return pipeline(image)
374
+
375
def fid_similarity(self, generated_image: Image.Image, reference_image: Image.Image) -> float:
    """Compute a FID-inspired similarity between the generated and reference images.

    Both images are preprocessed, passed through the Inception model, and the squared
    distance of their outputs is mapped to ``exp(-distance * NORMALIZE_FID_FACTOR)``.

    This metric is defined here as it requires loading the Inception model.
    Storing the model in this class is easier than passing it as an argument.
    """
    batches = [
        self._preprocess_image(picture).unsqueeze(0).to(self._device)
        for picture in (generated_image, reference_image)
    ]
    generated_features, reference_features = [self._get_inception_features(batch) for batch in batches]

    distance = self._calculate_fid(generated_features, reference_features)
    similarity: float = np.exp(-distance * self.NORMALIZE_FID_FACTOR)
    return similarity
390
+
391
def compute_ssim(self, generated_image: np.ndarray, reference_image: np.ndarray) -> float:
    """Return the Structural Similarity Index (SSIM) of the two images, using ``ssim`` defaults."""
    score = ssim(generated_image, reference_image)
    return score
394
+
395
def compute_edit_sim(self, completion: str, reference: str) -> float:
    """Return the token-level edit similarity between ``completion`` and ``reference``.

    The reference is first cut to the character length of the completion, then both
    texts are tokenized with the instance tokenizer and compared with
    ``_edit_similarity``.
    """
    # Only the reference prefix of the completion's length takes part in the comparison.
    reference_prefix: str = reference[: len(completion)]

    tokenize = self._tokenizer.tokenize
    # numpy arrays let the edit-distance routine use vectorized comparisons.
    completion_array = np.array(tokenize(completion))
    reference_array = np.array(tokenize(reference_prefix))

    return _edit_similarity(completion_array, reference_array)
409
+
410
def compute_emd_similarity_recursive(
    self,
    pred_image: Image.Image,
    ref_image: Image.Image,
    threshold_most_frequent_color: float = 0.5,
    patch_size: Tuple[int, int] = (8, 8),
    max_num_patches: int = 100,
    weight_most_frequent_color: float = 0.001,
    use_tqdm: bool = False,
) -> float:
    """Compute an Earth Mover's Distance based similarity between two images.

    The EMD between the prediction and the reference is normalized by the EMD
    between an all-white image of the reference's size and the reference (cached
    per reference image hash): ``1 - emd(pred, ref) / emd(white, ref)``.

    Args:
        pred_image: Image compiled from the model completion.
        ref_image: Ground-truth image.
        threshold_most_frequent_color, patch_size, max_num_patches,
        weight_most_frequent_color, use_tqdm: Forwarded unchanged to
            ``compute_emd_recursive``.

    Returns:
        The normalized similarity. When the white baseline has zero distance to the
        reference, 1.0 is returned if the prediction also has zero distance and 0.0
        otherwise, instead of dividing by zero.
    """
    emd_value = compute_emd_recursive(
        pred_image,
        ref_image,
        threshold_most_frequent_color,
        patch_size,
        max_num_patches,
        weight_most_frequent_color,
        use_tqdm,
    )

    def do_it():
        constant_image = Image.new("RGB", ref_image.size, (255, 255, 255))  # default color is white
        value = compute_emd_recursive(
            constant_image,
            ref_image,
            threshold_most_frequent_color,
            patch_size,
            max_num_patches,
            weight_most_frequent_color,
            use_tqdm,
        )
        return {"value": value}

    # The baseline only depends on the reference image, so it is cached on its hash.
    hash_dict = {
        "reference_image": str(AnnotatedImageMetrics.HASH_FUNC(ref_image, hash_size=self.HASH_LENGTH)),
    }
    cache_key = {"metric_name": f"intermediate_{self.EARTH_MOVER_SIMILARITY}", **hash_dict}
    assert self._cache is not None
    response_metric, _ = self._cache.get(cache_key, do_it)

    baseline_emd = response_metric["value"]
    if baseline_emd == 0:
        # A zero baseline would raise ZeroDivisionError or produce nan/inf.
        return 1.0 if emd_value == 0 else 0.0
    return 1.0 - emd_value / baseline_emd
@@ -0,0 +1,100 @@
1
+ import numpy as np
2
+
3
+ from helm.common.optional_dependencies import handle_module_not_found_error
4
+
5
+ try:
6
+ import cv2
7
+ from PIL.Image import Image
8
+ except ModuleNotFoundError as e:
9
+ handle_module_not_found_error(e, suggestions=["image2structure"])
10
+
11
+
12
def preprocess_image(image: Image) -> np.ndarray:
    """Convert an image to a grayscale ``uint8`` numpy array for use in metrics.

    The image is converted to mode "L" and copied into an array as is; no exposure
    normalization or other rescaling is performed.
    """
    gray_pixels = np.array(image.convert("L"))
    assert gray_pixels.dtype == np.uint8
    return gray_pixels
21
+
22
+
23
def pixel_similarity(img_a: np.ndarray, img_b: np.ndarray, threshold: float = 0.5, tolerance: float = 0.02) -> float:
    """
    Measure the pixel-level similarity between two images.

    A pixel matches when the Euclidean distance between its two colors, divided by
    255, is at most ``tolerance``. Colors whose combined count over both images
    exceeds ``threshold`` times the number of pixels of one image are considered
    dominant; a pixel showing a dominant color in either image never counts as a
    match. The score is the number of remaining matching pixels divided by the
    TOTAL number of pixels.

    NOTE(review): because the denominator is the total pixel count, two identical
    images with a dominant background score below 1. Confirm whether the ratio
    should instead be taken over the non-ignored pixels only.

    Args:
        img_a (np.ndarray): the first image, either (H, W) grayscale or (H, W, C)
        img_b (np.ndarray): the second image, same shape as ``img_a``
        threshold (float): Threshold to ignore dominant colors.
        tolerance (float): Tolerance for color variation.
    Returns:
        float: the pixel-level similarity between the images (between 0 and 1)
    Raises:
        ValueError: if the two images do not have the same shape
    """
    if img_a.shape != img_b.shape:
        raise ValueError(
            f"Images must have the same dimensions. img_a.shape = {img_a.shape}, img_b.shape = {img_b.shape}"
        )
    if img_a.size == 0:
        return 0.0

    # One row per pixel. A 2-D (grayscale) image has a single channel; reshaping it
    # with its last dimension would turn every image ROW into one "color".
    num_channels = 1 if img_a.ndim < 3 else img_a.shape[-1]
    img_a_flat = img_a.reshape(-1, num_channels)
    img_b_flat = img_b.reshape(-1, num_channels)
    num_pixels = len(img_a_flat)

    # Subtract in float: unsigned integer arithmetic wraps around (0 - 1 == 255 for
    # uint8), which would make the distance depend on the argument order.
    color_diff = np.linalg.norm(img_a_flat.astype(np.float64) - img_b_flat.astype(np.float64), axis=1) / 255
    within_tolerance = color_diff <= tolerance

    # Calculate frequencies of all colors over both images
    _, indices = np.unique(np.concatenate((img_a_flat, img_b_flat), axis=0), axis=0, return_inverse=True)
    indices = indices.reshape(-1)  # some NumPy releases return a 2-D inverse when axis is given
    color_counts = np.bincount(indices)

    # Identify colors to ignore based on frequency threshold
    ignore_colors_mask = color_counts > (num_pixels + num_pixels) * threshold / 2
    ignore_in_a = ignore_colors_mask[indices[:num_pixels]]
    ignore_in_b = ignore_colors_mask[indices[num_pixels:]]

    # A pixel counts only if it is dominant in neither image and within tolerance
    valid_pixels = ~(ignore_in_a | ignore_in_b) & within_tolerance

    return float(np.mean(valid_pixels))
68
+
69
+
70
def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
    """
    Use ORB features to measure image similarity between two numpy arrays representing images.

    Despite its name, the function relies on ORB (not SIFT) descriptors matched with
    a brute-force Hamming matcher. The score is the fraction of matches whose
    distance is below 70.

    Args:
        img_a (np.ndarray): the first image
        img_b (np.ndarray): the second image
    Returns:
        float: the ORB similarity between the images; 0.0 when either image yields
            no descriptors or when there are no matches
    Raises:
        ValueError: if either array has fewer than 3 dimensions
    """
    if len(img_a.shape) < 3 or len(img_b.shape) < 3:
        raise ValueError("Both images must have 3 channels")

    # Initialize the ORB feature detector (cv2.ORB is the legacy constructor name)
    orb = cv2.ORB_create() if hasattr(cv2, "ORB_create") else cv2.ORB()

    # Find the keypoints and descriptors with ORB
    _, desc_a = orb.detectAndCompute(img_a, None)
    _, desc_b = orb.detectAndCompute(img_b, None)

    # No keypoints (e.g. a blank image) means descriptors are None, and the
    # matcher would raise on them: there is nothing to match.
    if desc_a is None or desc_b is None:
        return 0.0

    # Initialize the brute force matcher
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)

    # Match descriptors.
    matches = bf.match(desc_a, desc_b)
    if len(matches) == 0:
        return 0.0

    # Calculate similarity based on the distance of the matches
    similar_regions = [match for match in matches if match.distance < 70]
    return len(similar_regions) / len(matches)