crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
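The renames above move the client and tokenizer implementations out of helm.proxy into the top-level helm.clients and helm.tokenizers packages, so downstream imports need updating. A minimal sketch of the migration, assuming import paths mirror the file layout shown in this list (class names inferred from the module names):

    # 0.4.0: clients and tokenizers lived under helm.proxy
    from helm.proxy.clients.huggingface_client import HuggingFaceClient
    from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    # 0.5.0: they are top-level packages
    from helm.clients.huggingface_client import HuggingFaceClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer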
helm/benchmark/run_specs/vlm_run_specs.py ADDED
@@ -0,0 +1,501 @@
+"""Run spec functions for evaluating Vision-Language Models."""
+
+from typing import List, Optional, Dict
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION_MULTIMODAL,
+    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_reference_metric_specs,
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+
+
+############################################################
+# Prototypical adapter specs for VLM evaluation
+
+
+def get_generation_adapter_spec(
+    instructions: str = "",
+    input_prefix: str = "",
+    input_suffix: str = "",
+    output_prefix: str = "",
+    output_suffix: str = "",
+    max_tokens: int = 100,
+    stop_sequences: Optional[List[str]] = None,
+) -> AdapterSpec:
+    return AdapterSpec(
+        method=ADAPT_GENERATION_MULTIMODAL,
+        global_prefix="",
+        instructions=instructions,
+        input_prefix=input_prefix,
+        input_suffix=input_suffix,
+        output_prefix=output_prefix,
+        output_suffix=output_suffix,
+        instance_prefix="\n",
+        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=max_tokens,
+        stop_sequences=stop_sequences if stop_sequences is not None else [],
+        random=None,
+    )
+
+
+def get_short_answer_generation_adapter_spec():
+    return get_generation_adapter_spec(
+        instructions="Just give a short answer without answering in a complete sentence.",
+        max_tokens=20,
+    )
+
+
+def get_multiple_choice_joint_adapter_spec(
+    input_noun: Optional[str],
+    output_noun: str,
+    max_train_instances: int = 0,
+    num_outputs: int = 1,
+) -> AdapterSpec:
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
+        global_prefix="",
+        instructions="Answer the multiple choice question by just giving the letter of the correct answer.",
+        input_prefix=f"{input_noun}: " if input_noun is not None else "",
+        input_suffix="\n",
+        output_prefix=f"{output_noun}: ",
+        output_suffix="\n",
+        instance_prefix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=1,
+        stop_sequences=["\n"],
+        temperature=0.0,
+        random=None,
+    )
+
+
+############################################################
+# VHELM metric specs
+
+
+def get_image2structure_metric_specs(
+    generation_type: str,
+    metric_names: Optional[List[str]] = None,
+    args: Optional[Dict] = None,
+    include_edit_similarity: bool = True,
+    size_handling_method: str = "resize",
+) -> List[MetricSpec]:
+    from helm.benchmark.metrics.vision_language.image_metrics import AnnotatedImageMetrics
+
+    if metric_names is None:
+        metric_names = [
+            AnnotatedImageMetrics.PIXEL_SIMILARITY,
+            AnnotatedImageMetrics.FID_SIMILARITY,
+            AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
+        ]
+    if include_edit_similarity:
+        metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)
+    if args is None:
+        args = {}
+    metric_specs = [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.vision_language.image_metrics.AnnotatedImageMetrics",
+            args={
+                "generation_type": generation_type,
+                "metric_names": metric_names,
+                "size_handling_method": size_handling_method,
+                **args,
+            },
+        ),
+    ]
+    return metric_specs + get_basic_reference_metric_specs()
+
+
+############################################################
+# VHELM run specs
+
+
+@run_spec_function("chart2csv")
+def get_chart2csv_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
+        args={},
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
+        "Please infer the missing labels based on the surrounding context. "
+        "Just give the CSV without any explanation.",
+        max_tokens=1000,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "chart2csv"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("hateful_memes")
+def get_hateful_memes_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Answer Yes or No without an explanation.",
+        max_tokens=3,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "hateful_memes"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("viz_wiz")
+def get_viz_wiz_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
+    )
+    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "viz_wiz"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("vqa")
+def get_vqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
+    )
+    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "vqa"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("image2latex")
+def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
+        args={"subset": subset, "recompile_prompt": recompile_prompt},
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Just give a short answer without answering in a complete sentence.",
+        max_tokens=2000,
+    )
+    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+        generation_type="latex",
+        args=args,
+        include_edit_similarity=True,
+        size_handling_method="padding",
+    )
+    annotator_specs: List[AnnotatorSpec] = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.image2structure.latex_compiler_annotator.LatexCompilerAnnotator",
+        )
+    ]
+
+    run_spec_name: str = "image2latex"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+        annotators=annotator_specs,
+    )
+
+
+@run_spec_function("image2webpage")
+def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
+        args={"subset": subset, "recompile_prompt": recompile_prompt},
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Just give a short answer without answering in a complete sentence.",
+        max_tokens=2000,
+    )
+    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+        generation_type="webpage",
+        args=args,
+        include_edit_similarity=True,
+        size_handling_method="none",
+    )
+    annotator_specs: List[AnnotatorSpec] = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.image2structure.webpage_compiler_annotator.WebpageCompilerAnnotator",
+        )
+    ]
+
+    run_spec_name: str = "image2webpage"
+    return RunSpec(
+        name=f"{run_spec_name}:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+        annotators=annotator_specs,
+    )
+
+
+@run_spec_function("image2musicsheet")
+def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
+ args={"subset": "music", "recompile_prompt": False}, # There os only one subset for music sheets
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Just give a short answer without answering in a complete sentence.",
+        max_tokens=2000,
+    )
+    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+        generation_type="lilypond",
+        args=args,
+        include_edit_similarity=False,  # No ground truth for music sheets
+        size_handling_method="padding",
+    )
+    annotator_specs: List[AnnotatorSpec] = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.image2structure.lilypond_compiler_annotator.LilypondCompilerAnnotator",  # noqa: E501
+        )
+    ]
+
+    run_spec_name: str = "image2musicsheet"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+        annotators=annotator_specs,
+    )
+
+
+@run_spec_function("mmmu")
+def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mmmu_scenario.MMMUScenario",
+        args={"subject": subject, "question_type": question_type},
+    )
+
+    adapter_spec: AdapterSpec
+    if question_type == "open":
+        adapter_spec = get_short_answer_generation_adapter_spec()
+    elif question_type == "multiple-choice":
+        adapter_spec = get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        )
+    else:
+        raise ValueError(f"Invalid question type: {question_type}")
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "mmmu"
+    return RunSpec(
+        name=f"{run_spec_name}:subject={subject},question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("unicorn")
+def get_unicorn_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Only give numerical or boolean answer without an explanation."
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "unicorn"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("bingo")
+def get_bingo_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
+    )
+    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "bingo"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("multipanelvqa")
+def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.multipanelvqa_scenario.MultipanelVQAScenario",
+        args={"subject": subject, "question_type": question_type},
+    )
+
+    adapter_spec: AdapterSpec
+    if question_type == "open":
+        adapter_spec = get_short_answer_generation_adapter_spec()
+    elif question_type == "multiple-choice":
+        adapter_spec = get_multiple_choice_joint_adapter_spec(
+            input_noun=None, output_noun="Answer", max_train_instances=0
+        )
+    else:
+        raise ValueError(f"Invalid question type: {question_type}")
+
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "multipanelvqa"
+    return RunSpec(
+        name=f"{run_spec_name}:subject={subject},question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("pope")
+def get_pope_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
+    )
+    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "pope"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("seed_bench")
+def get_seed_bench_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "seed_bench"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mme")
+def get_mme_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+        input_noun=None, output_noun="Answer", max_train_instances=0
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "mme"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("heim_human_eval")
+def get_heim_human_eval_spec(question_type: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
+        args={"question_type": question_type},
+    )
+    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
+        input_noun=None,
+        output_noun="Answer",
+        num_outputs=1,
+        max_train_instances=0,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+
+    run_spec_name: str = "heim_human_eval"
+    return RunSpec(
+        name=f"{run_spec_name}:question_type={question_type}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
+
+
+@run_spec_function("mementos")
+def get_mementos_spec(subject: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
+        args={"subject": subject},
+    )
+    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
+    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()
+
+    run_spec_name: str = "mementos"
+    return RunSpec(
+        name=run_spec_name,
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
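Each @run_spec_function decorator above registers its factory under the given name; calling a factory directly yields a fully assembled RunSpec. A minimal sketch (the subject value "Art" is a hypothetical argument; valid subjects are defined by MMMUScenario, which is not shown in this diff):

    from helm.benchmark.run_specs.vlm_run_specs import get_mmmu_spec

    run_spec = get_mmmu_spec(subject="Art", question_type="multiple-choice")
    print(run_spec.name)    # mmmu:subject=Art,question_type=multiple-choice
    print(run_spec.groups)  # ['mmmu']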
helm/benchmark/runner.py CHANGED
@@ -6,45 +6,45 @@ import traceback
 import typing
 from collections import Counter
 import dataclasses
-from dataclasses import dataclass, field
 from typing import Any, Dict, List
 import numpy as np

 from tqdm import tqdm

+from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
-from .augmentations.data_augmenter import DataAugmenterSpec
-from .scenarios.scenario import (
+from helm.benchmark.scenarios.scenario import (
     EVAL_SPLITS,
     TRAIN_SPLIT,
     Scenario,
-    ScenarioSpec,
     create_scenario,
     Instance,
     get_scenario_cache_path,
     with_instance_ids,
 )
-from .adaptation.adapters.adapter import Adapter
-from .adaptation.adapters.adapter_factory import AdapterFactory
-from .adaptation.scenario_state import ScenarioState
-from .adaptation.adapter_spec import AdapterSpec
-from .data_preprocessor import DataPreprocessor
-from .executor import ExecutionSpec, Executor
-from .metrics.dry_run_metrics import DryRunMetric
-from .metrics.metric_name import MetricName
-from .metrics.metric_service import MetricService
-from .metrics.metric import Metric, MetricSpec, MetricResult, PerInstanceStats, create_metric, Stat
-from .window_services.tokenizer_service import TokenizerService
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec, Executor
+from helm.benchmark.annotation_executor import AnnotationExecutionSpec, AnnotationExecutor
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.window_services.tokenizer_service import TokenizerService


 LATEST_SYMLINK: str = "latest"
 _BENCHMARK_OUTPUT_PATH: str = "benchmark_output"
+_CACHED_MODELS_FOLDER: str = "models"


 def get_benchmark_output_path() -> str:
-    """Get the genchmark output path.
+    """Get the benchmark output path.

     Many run spec functions need to know the benchmark output path,
     but there is no way to pass it via the run spec function,
@@ -52,8 +52,15 @@ def get_benchmark_output_path() -> str:
     return _BENCHMARK_OUTPUT_PATH


+def get_cached_models_path() -> str:
+ """Get the cached models pat within the benchmark output path."""
+    path: str = os.path.join(get_benchmark_output_path(), _CACHED_MODELS_FOLDER)
+    ensure_directory_exists(path)
+    return path
+
+
 def set_benchmark_output_path(benchmark_output_path: str) -> None:
-    """Set the genchmark output path."""
+    """Set the benchmark output path."""
     global _BENCHMARK_OUTPUT_PATH
     _BENCHMARK_OUTPUT_PATH = benchmark_output_path

@@ -64,40 +71,6 @@ class RunnerError(Exception):
     pass


-@dataclass(frozen=True)
-class RunSpec:
-    """
-    Specifies how to do a single run, which gets a scenario, adapts it, and
-    computes a list of stats based on the defined metrics.
-    """
-
-    # Unique identifier of the RunSpec
-    name: str
-
-    # Which scenario
-    scenario_spec: ScenarioSpec
-
-    # Specifies how to adapt an instance into a set of requests
-    adapter_spec: AdapterSpec
-
-    # What to evaluate on
-    metric_specs: List[MetricSpec]
-
-    # Data augmenter. The default `DataAugmenterSpec` does nothing.
-    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
-
-    # Groups that this run spec belongs to (for aggregation)
-    groups: List[str] = field(default_factory=list)
-
-    def __post_init__(self):
-        """
-        `self.name` is used as the name of the output folder for the `RunSpec`.
-        Clean up `self.name` by replacing any "/"'s with "_".
-        """
-        # TODO: Don't mutate name! clean this up before passing it into the constructor here
-        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
-
-
 def remove_stats_nans(stats: List[Stat]) -> List[Stat]:
     """Return a new list of stats with stats with NaNs removed.

@@ -129,7 +102,9 @@ def remove_per_instance_stats_nans(per_instance_stats_list: List[PerInstanceStat
     return result


-def downsample_eval_instances(instances: List[Instance], max_eval_instances: int) -> List[Instance]:
+def downsample_eval_instances(
+    instances: List[Instance], max_eval_instances: int, eval_splits: List[str]
+) -> List[Instance]:
     """
     Get the instances necessary for this run:
     Train instances (split=train): keep all (if any) for in-context learning
@@ -138,7 +113,7 @@ def downsample_eval_instances(instances: List[Instance], max_eval_instances: int
     """
     all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]

-    all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in EVAL_SPLITS]
+    all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in eval_splits]
     if len(all_eval_instances) > max_eval_instances:
         # The random sampling includes instances monotonically.
         np.random.seed(0)
@@ -179,6 +154,15 @@ class Runner:
         exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
+        self.annotator_executor = AnnotationExecutor(
+            AnnotationExecutionSpec(
+                local_path=execution_spec.local_path if execution_spec.local_path is not None else "",
+                parallelism=execution_spec.parallelism,
+                dry_run=execution_spec.dry_run,
+                sqlite_cache_backend_config=execution_spec.sqlite_cache_backend_config,
+                mongo_cache_backend_config=execution_spec.mongo_cache_backend_config,
+            )
+        )
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
@@ -280,12 +264,14 @@
             return  # Exit after saving the instances.

         # Give each instance a unique ID
-        instances = with_instance_ids(instances)
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)

         # Get the instances necessary for this run.
         max_eval_instances = run_spec.adapter_spec.max_eval_instances
+        eval_splits = run_spec.adapter_spec.eval_splits or EVAL_SPLITS
         if max_eval_instances is not None:
-            instances = downsample_eval_instances(instances, max_eval_instances)
+            instances = downsample_eval_instances(instances, max_eval_instances, eval_splits)

         # Data preprocessing
         instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
@@ -294,15 +280,23 @@
         # Adapt (convert to requests)
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
-        scenario_state: ScenarioState = adapter.adapt(instances, self.executor.execution_spec.parallelism)
+        request_states: List[RequestState] = adapter.adapt(instances, self.executor.execution_spec.parallelism)
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=request_states,
+            annotator_specs=run_spec.annotators,
+        )

         # Execute (fill up results)
         scenario_state = self.executor.execute(scenario_state)

+        # Annotate (post-process the results)
+        scenario_state = self.annotator_executor.execute(scenario_state)
+
         # Apply the metrics
         # When performing a dry run, only estimate the number of tokens instead
         # of calculating the metrics.
-        metrics: List[Metric] = (
+        metrics: List[MetricInterface] = (
             [DryRunMetric()] if self.dry_run else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
         )
         stats: List[Stat] = []
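The get_cached_models_path helper introduced above gives run components a shared directory for downloaded model assets under the benchmark output path. A usage sketch (the "my_model" subfolder is hypothetical):

    import os

    from helm.benchmark.runner import get_cached_models_path

    # Ensures <benchmark_output>/models exists and returns its path.
    models_root: str = get_cached_models_path()
    weights_dir: str = os.path.join(models_root, "my_model")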