crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -12,15 +12,16 @@ from helm.benchmark.model_metadata_registry import (
12
12
  FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
13
13
  LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
14
14
  ABLATION_MODEL_TAG,
15
+ TEXT_TO_IMAGE_MODEL_TAG,
15
16
  VISION_LANGUAGE_MODEL_TAG,
16
17
  )
17
18
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
18
- from helm.common.general import handle_module_not_found_error
19
19
  from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
20
- from .runner import RunSpec
21
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
20
+ from .run_spec import RunSpec
21
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
22
22
  from .augmentations.perturbation import PerturbationSpec
23
23
  from .augmentations.data_augmenter import DataAugmenterSpec
24
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
24
25
 
25
26
 
26
27
  class RunExpander(ABC):
@@ -223,12 +224,16 @@ class AddToStopRunExpander(RunExpander):
223
224
  self.value = value
224
225
 
225
226
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
227
+ if self.value == "newline":
228
+ stop_sequence = "\n"
229
+ else:
230
+ stop_sequence = self.value
226
231
  return [
227
232
  replace(
228
233
  run_spec,
229
234
  name=run_spec.name,
230
235
  adapter_spec=replace(
231
- run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
236
+ run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
232
237
  ),
233
238
  ),
234
239
  ]
@@ -273,33 +278,37 @@ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
273
278
  )
274
279
 
275
280
 
276
- class AnthropicRunExpander(RunExpander):
281
+ class AnthropicClaude2RunExpander(RunExpander):
277
282
  """
278
- Custom prompt for Anthropic models.
283
+ Custom prompt for Anthropic Claude 1 and Claude 2 models.
279
284
  These models need more explicit instructions about following the format.
280
285
  """
281
286
 
282
287
  name = "anthropic"
283
288
 
289
+ # These strings must be added to the prompt in order to pass prompt validation,
290
+ # otherwise the Anthropic API will return an error.
291
+ # See: https://docs.anthropic.com/claude/reference/prompt-validation
292
+ HUMAN_PROMPT = "\n\nHuman:"
293
+ AI_PROMPT = "\n\nAssistant:"
294
+
284
295
  def __init__(self):
285
296
  pass
286
297
 
287
298
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
288
- try:
289
- import anthropic
290
- except ModuleNotFoundError as e:
291
- handle_module_not_found_error(e, ["anthropic"])
292
-
293
299
  return [
294
300
  replace(
295
301
  run_spec,
296
302
  name=run_spec.name,
297
303
  adapter_spec=replace(
298
304
  run_spec.adapter_spec,
299
- global_prefix=anthropic.HUMAN_PROMPT + " " + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
305
+ global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
306
+ + " "
307
+ + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
308
+ + "\n\n",
300
309
  global_suffix="\n\n"
301
310
  + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
302
- + anthropic.AI_PROMPT
311
+ + AnthropicClaude2RunExpander.AI_PROMPT
303
312
  + " "
304
313
  + run_spec.adapter_spec.output_prefix.strip(),
305
314
  ),
@@ -307,13 +316,32 @@ class AnthropicRunExpander(RunExpander):
307
316
  ]
308
317
 
309
318
 
319
+ class AnthropicClaude3RunExpander(RunExpander):
320
+ """Custom prompts for Anthropic Claude 3 models."""
321
+
322
+ name = "claude_3"
323
+
324
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
325
+ if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
326
+ instructions = "Answer with only a single letter."
327
+ if run_spec.adapter_spec.instructions:
328
+ instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
329
+ return [
330
+ replace(
331
+ run_spec,
332
+ adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
333
+ ),
334
+ ]
335
+ return [run_spec]
336
+
337
+
310
338
  class OpenAIRunExpander(RunExpander):
311
339
  """
312
340
  Custom prompt for OpenAI models.
313
341
  These models need more explicit instructions about following the format.
314
342
  """
315
343
 
316
- # TODO: Refactor out common logic between this and GoogleRunExpander.
344
+ # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
317
345
 
318
346
  name = "openai"
319
347
 
@@ -346,7 +374,7 @@ class GoogleRunExpander(RunExpander):
346
374
  These models need more explicit instructions about following the format.
347
375
  """
348
376
 
349
- # TODO: Refactor out common logic between this and OpenAIRunExpander.
377
+ # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
350
378
 
351
379
  name = "google"
352
380
 
@@ -370,6 +398,102 @@ class GoogleRunExpander(RunExpander):
370
398
  ]
371
399
 
372
400
 
401
+ class MistralRunExpander(RunExpander):
402
+ """Custom prompt for Mistral models."""
403
+
404
+ # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
405
+
406
+ name = "output_format_instructions"
407
+
408
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
409
+ if run_spec.adapter_spec.method != ADAPT_GENERATION:
410
+ return [run_spec]
411
+
412
+ return [
413
+ replace(
414
+ run_spec,
415
+ name=run_spec.name,
416
+ adapter_spec=replace(
417
+ run_spec.adapter_spec,
418
+ global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
419
+ global_suffix="\n\n"
420
+ + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
421
+ + "\n"
422
+ + run_spec.adapter_spec.output_prefix.strip(),
423
+ ),
424
+ ),
425
+ ]
426
+
427
+
428
+ class IDEFICSInstructRunExpander(RunExpander):
429
+ """
430
+ Custom prompt for IDEFICS instruct models which require a specific format.
431
+ See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
432
+ """
433
+
434
+ name = "idefics_instruct"
435
+
436
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
437
+ return [
438
+ replace(
439
+ run_spec,
440
+ name=run_spec.name,
441
+ adapter_spec=replace(
442
+ run_spec.adapter_spec,
443
+ input_prefix="User: ",
444
+ input_suffix="<end_of_utterance>",
445
+ output_prefix="\nAssistant: ",
446
+ output_suffix="<end_of_utterance>",
447
+ stop_sequences=["<end_of_utterance>"],
448
+ ),
449
+ ),
450
+ ]
451
+
452
+
453
+ class LlavaRunExpander(RunExpander):
454
+ """
455
+ Custom prompt for Llava 1.5 models which should use a specific format.
456
+ See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
457
+ """
458
+
459
+ name = "llava"
460
+
461
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
462
+ return [
463
+ replace(
464
+ run_spec,
465
+ name=run_spec.name,
466
+ adapter_spec=replace(
467
+ run_spec.adapter_spec,
468
+ input_prefix="USER: <image>",
469
+ input_suffix="",
470
+ output_prefix="\nASSISTANT: ",
471
+ output_suffix="",
472
+ ),
473
+ ),
474
+ ]
475
+
476
+
477
+ class OpenFlamingoRunExpander(RunExpander):
478
+ """
479
+ Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
480
+ """
481
+
482
+ name = "open_flamingo"
483
+
484
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
485
+ return [
486
+ replace(
487
+ run_spec,
488
+ name=run_spec.name,
489
+ adapter_spec=replace(
490
+ run_spec.adapter_spec,
491
+ input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
492
+ ),
493
+ ),
494
+ ]
495
+
496
+
373
497
  class FormatPromptRunExpander(RunExpander):
374
498
  """Adds a prefix and suffix to the prompt."""
375
499
 
@@ -415,6 +539,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
415
539
  "one": [1],
416
540
  "all": [0, 1, 2, 4, 8, 16], # Cap at 16 due to limited context length
417
541
  "big_bench_few_shot_setting": [0, 1, 2, 3], # Commonly used few-shot setting in BIG-bench
542
+ "heim_human_eval": [0, 1, 2, 4, 8],
418
543
  }
419
544
 
420
545
 
@@ -422,7 +547,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
422
547
  """For overriding the number of eval instances at the run level."""
423
548
 
424
549
  name = "max_eval_instances"
425
- values_dict: Dict[str, List[Any]] = {}
550
+ values_dict: Dict[str, List[Any]] = {
551
+ "default": [1_000],
552
+ "heim_default": [100],
553
+ "heim_fid": [30_000],
554
+ "heim_art_styles": [17],
555
+ }
426
556
 
427
557
 
428
558
  class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -435,6 +565,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
435
565
  }
436
566
 
437
567
 
568
+ class NumTrialRunExpander(ReplaceValueRunExpander):
569
+ """For getting different generations for the same requests."""
570
+
571
+ name = "num_trials"
572
+ values_dict = {
573
+ "heim_efficiency": [5],
574
+ }
575
+
576
+
438
577
  class ModelRunExpander(ReplaceValueRunExpander):
439
578
  """
440
579
  For specifying different models.
@@ -476,6 +615,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
476
615
  "openai/text-davinci-003",
477
616
  ],
478
617
  "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
618
+ "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
479
619
  "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
480
620
  }
481
621
 
@@ -500,6 +640,33 @@ class ModelDeploymentRunExpander(ReplaceValueRunExpander):
500
640
  values_dict: Dict[str, List[Any]] = {}
501
641
 
502
642
 
643
+ class EvalSplitRunExpander(RunExpander):
644
+ """Sets the evaluation split.
645
+
646
+ By default, evaluation instances are drawn from both test and validation splits.
647
+ This run expander allows drawing evaluation instances from only the test split or
648
+ only the validation split."""
649
+
650
+ # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
651
+ # run expander name to be "eval_split", not "eval_splits".
652
+
653
+ name = "eval_split"
654
+
655
+ def __init__(self, value):
656
+ if value != TEST_SPLIT and value != VALID_SPLIT:
657
+ raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
658
+ self.split = value
659
+
660
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
661
+ return [
662
+ replace(
663
+ run_spec,
664
+ name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
665
+ adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
666
+ )
667
+ ]
668
+
669
+
503
670
  ############################################################
504
671
 
505
672
 
@@ -688,6 +855,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
688
855
  )
689
856
 
690
857
 
858
+ def translate(language_code: str) -> PerturbationSpec:
859
+ return PerturbationSpec(
860
+ class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
861
+ args={"language_code": language_code},
862
+ )
863
+
864
+
865
+ def suffix(text: str) -> PerturbationSpec:
866
+ return PerturbationSpec(
867
+ class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
868
+ args={"suffix": text},
869
+ )
870
+
871
+
691
872
  # Specifies the data augmentations that we're interested in trying out.
692
873
  # Concretely, this is a mapping from the name (which is specified in a conf
693
874
  # file or the CLI) to a list of options to try, where each option is a list of perturbations.
@@ -879,6 +1060,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
879
1060
  mandarin_to_cantonese(),
880
1061
  ]
881
1062
  },
1063
+ # Multilinguality
1064
+ "chinese": {"chinese": [translate(language_code="zh-CN")]},
1065
+ "hindi": {"hindi": [translate(language_code="hi")]},
1066
+ "spanish": {"spanish": [translate(language_code="es")]},
1067
+ # Styles
1068
+ "art": {
1069
+ "art": [
1070
+ suffix("oil painting"),
1071
+ suffix("watercolor"),
1072
+ suffix("pencil sketch"),
1073
+ suffix("animation"),
1074
+ suffix("vector graphics"),
1075
+ suffix("pixel art"),
1076
+ ]
1077
+ },
882
1078
  }
883
1079
 
884
1080
 
@@ -1225,6 +1421,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1225
1421
  MaxTrainInstancesRunExpander,
1226
1422
  MaxEvalInstancesRunExpander,
1227
1423
  NumOutputsRunExpander,
1424
+ NumTrialRunExpander,
1228
1425
  ModelRunExpander,
1229
1426
  ModelDeploymentRunExpander,
1230
1427
  DataAugmentationRunExpander,
@@ -1232,6 +1429,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1232
1429
  NumPromptTokensRunExpander,
1233
1430
  NumOutputTokensRunExpander,
1234
1431
  ChatMLRunExpander,
1432
+ EvalSplitRunExpander,
1235
1433
  ]
1236
1434
 
1237
1435
 
@@ -0,0 +1,93 @@
1
+ from dataclasses import dataclass, field
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict, Iterable, List, Optional, TypeVar
6
+
7
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
+ from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
9
+ from helm.benchmark.metrics.metric import MetricSpec
10
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
11
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class RunSpec:
16
+ """
17
+ Specifies how to do a single run, which gets a scenario, adapts it, and
18
+ computes a list of stats based on the defined metrics.
19
+ """
20
+
21
+ name: str
22
+ """Unique identifier of the RunSpec"""
23
+
24
+ scenario_spec: ScenarioSpec
25
+ """Which scenario"""
26
+
27
+ adapter_spec: AdapterSpec
28
+ """Specifies how to adapt an instance into a set of requests"""
29
+
30
+ metric_specs: List[MetricSpec]
31
+ """What to evaluate on"""
32
+
33
+ data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
34
+ """Data augmenter. The default `DataAugmenterSpec` does nothing."""
35
+
36
+ groups: List[str] = field(default_factory=list)
37
+ """Groups that this run spec belongs to (for aggregation)"""
38
+
39
+ annotators: Optional[List[AnnotatorSpec]] = None
40
+ """Annotators to use for this run spec"""
41
+
42
+ def __post_init__(self):
43
+ """
44
+ `self.name` is used as the name of the output folder for the `RunSpec`.
45
+ Clean up `self.name` by replacing any "/"'s with "_".
46
+ """
47
+ # TODO: Don't mutate name! clean this up before passing it into the constructor here
48
+ object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
49
+
50
+
51
+ RunSpecFunction = Callable[..., RunSpec]
52
+
53
+
54
+ _REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
55
+ """Dict of run spec function names to run spec functions."""
56
+
57
+
58
+ F = TypeVar("F", bound=RunSpecFunction)
59
+
60
+
61
+ def run_spec_function(name: str) -> Callable[[F], F]:
62
+ """Register the run spec function under the given name."""
63
+
64
+ def wrap(func: F) -> F:
65
+ if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
66
+ raise ValueError(f"A run spec function with name {name} already exists")
67
+ _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
68
+ return func
69
+
70
+ return wrap
71
+
72
+
73
+ # Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
74
+ def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
75
+ # Specifying the second argument (prefix) to iter_modules makes the
76
+ # returned name an absolute name instead of a relative one. This allows
77
+ # import_module to work without having to do additional modification to
78
+ # the name.
79
+ return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
80
+
81
+
82
+ def discover_run_spec_functions() -> None:
83
+ """Discover and register all run spec functions under helm.benchmark.run_specs"""
84
+ import helm.benchmark.run_specs # noqa
85
+
86
+ for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
87
+ importlib.import_module(name)
88
+
89
+
90
+ def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
91
+ """Return the run spec function registered under the given name."""
92
+ discover_run_spec_functions()
93
+ return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)
@@ -0,0 +1,162 @@
1
+ import dataclasses
2
+ from typing import List
3
+
4
+ from helm.benchmark.adaptation.adapter_spec import (
5
+ ADAPT_GENERATION,
6
+ ADAPT_MULTIPLE_CHOICE_JOINT,
7
+ ADAPT_GENERATION_MULTIMODAL,
8
+ )
9
+ from helm.benchmark.model_deployment_registry import (
10
+ ModelDeployment,
11
+ get_default_model_deployment_for_model,
12
+ get_model_deployment,
13
+ )
14
+ from helm.benchmark.model_metadata_registry import (
15
+ ANTHROPIC_CLAUDE_1_MODEL_TAG,
16
+ ANTHROPIC_CLAUDE_2_MODEL_TAG,
17
+ BUGGY_TEMP_0_TAG,
18
+ CHATML_MODEL_TAG,
19
+ GOOGLE_GEMINI_MODEL_TAG,
20
+ IDEFICS_INSTRUCT_MODEL_TAG,
21
+ IDEFICS_MODEL_TAG,
22
+ LLAVA_MODEL_TAG,
23
+ OPEN_FLAMINGO_MODEL_TAG,
24
+ VISION_LANGUAGE_MODEL_TAG,
25
+ NLG_PREFIX_TAG,
26
+ NO_NEWLINES_TAG,
27
+ ModelMetadata,
28
+ get_model_metadata,
29
+ )
30
+ from helm.benchmark.run_expander import (
31
+ RUN_EXPANDERS,
32
+ AnthropicClaude2RunExpander,
33
+ ChatMLRunExpander,
34
+ GlobalPrefixRunExpander,
35
+ IDEFICSInstructRunExpander,
36
+ IncreaseTemperatureRunExpander,
37
+ IncreaseMaxTokensRunExpander,
38
+ LlavaRunExpander,
39
+ OpenFlamingoRunExpander,
40
+ StopRunExpander,
41
+ )
42
+ from helm.benchmark.run_spec import RunSpec, get_run_spec_function
43
+ from helm.common.general import singleton
44
+ from helm.common.object_spec import ObjectSpec
45
+
46
+
47
+ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
48
+ """
49
+ Takes a specification (name, args) and returns a list of `RunSpec`s.
50
+ """
51
+ # Note that we are abusing `spec` a bit because the name is not actually a class name.
52
+ name = spec.class_name
53
+ args = spec.args
54
+
55
+ run_spec_function = get_run_spec_function(name)
56
+ if run_spec_function is None:
57
+ raise ValueError(f"Unknown run spec name: {name}")
58
+
59
+ # Peel off the run expanders (e.g., model)
60
+ expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore
61
+ args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
62
+
63
+ run_specs: List[RunSpec] = [run_spec_function(**args)]
64
+
65
+ # Apply expanders
66
+ for expander in expanders:
67
+ run_specs = [
68
+ child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
69
+ ]
70
+
71
+ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
72
+ if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
73
+ raise ValueError("At least one of model_deployment and model must be specified")
74
+ elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment:
75
+ # Infer model from model deployment
76
+ default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name
77
+ if not default_model_name:
78
+ default_model_name = run_spec.adapter_spec.model_deployment
79
+ run_spec = dataclasses.replace(
80
+ run_spec,
81
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name),
82
+ )
83
+ elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
84
+ # Infer model deployment from model
85
+ default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model)
86
+ if not default_model_deployment:
87
+ raise ValueError(
88
+ f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}"
89
+ )
90
+ run_spec = dataclasses.replace(
91
+ run_spec,
92
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment),
93
+ )
94
+
95
+ # Both model and model_deployment should now be filled
96
+ assert run_spec.adapter_spec.model_deployment
97
+ assert run_spec.adapter_spec.model
98
+
99
+ model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model)
100
+ deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment)
101
+ if run_spec.adapter_spec.model != deployment.model_name:
102
+ raise ValueError(
103
+ f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'"
104
+ f"for model '{run_spec.adapter_spec.model}' but the model deployment is "
105
+ f"for a different model '{deployment.model_name}'"
106
+ )
107
+ # For models that strip newlines, when we're generating, we need to set
108
+ # the delimiter to be '###' so we stop properly.
109
+ if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
110
+ ADAPT_GENERATION,
111
+ ADAPT_MULTIPLE_CHOICE_JOINT,
112
+ ):
113
+ stop_expander = StopRunExpander(value="hash")
114
+ run_spec = singleton(stop_expander.expand(run_spec))
115
+
116
+ if NLG_PREFIX_TAG in model.tags:
117
+ global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
118
+ run_spec = singleton(global_prefix_expander.expand(run_spec))
119
+
120
+ if CHATML_MODEL_TAG in model.tags:
121
+ chatml_expander = ChatMLRunExpander()
122
+ run_spec = singleton(chatml_expander.expand(run_spec))
123
+
124
+ # Anthropic Claude 1 and 2 prompts
125
+ if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
126
+ run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
127
+
128
+ # Google Gemini Vision returns an empty completion or throws an error if max_tokens is 1
129
+ if (
130
+ VISION_LANGUAGE_MODEL_TAG in model.tags
131
+ and GOOGLE_GEMINI_MODEL_TAG in model.tags
132
+ and run_spec.adapter_spec.max_tokens == 1
133
+ ):
134
+ run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
135
+
136
+ # IDEFICS special handling
137
+ if IDEFICS_MODEL_TAG in model.tags:
138
+ # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
139
+ if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
140
+ run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
141
+
142
+ if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
143
+ run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
144
+
145
+ # Llava
146
+ if LLAVA_MODEL_TAG in model.tags:
147
+ run_spec = singleton(LlavaRunExpander().expand(run_spec))
148
+
149
+ # OpenFlamingo
150
+ if OPEN_FLAMINGO_MODEL_TAG in model.tags:
151
+ run_spec = singleton(OpenFlamingoRunExpander().expand(run_spec))
152
+
153
+ # For multiple choice
154
+ if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
155
+ increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
156
+ run_spec = singleton(increase_temperature_expander.expand(run_spec))
157
+
158
+ return run_spec
159
+
160
+ run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
161
+
162
+ return run_specs
File without changes