crfm-helm: 0.3.0 (py3-none-any.whl) → 0.5.0 (py3-none-any.whl)

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -3,7 +3,7 @@ from abc import ABC, abstractmethod
3
3
  from dataclasses import replace
4
4
  from typing import Any, List, Dict, Optional, Tuple, Type
5
5
 
6
- from helm.proxy.models import (
6
+ from helm.benchmark.model_metadata_registry import (
7
7
  get_all_instruction_following_models,
8
8
  get_all_code_models,
9
9
  get_all_models,
@@ -11,20 +11,17 @@ from helm.proxy.models import (
11
11
  get_model_names_with_tag,
12
12
  FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
13
13
  LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
14
- GPT2_TOKENIZER_TAG,
15
- AI21_TOKENIZER_TAG,
16
- COHERE_TOKENIZER_TAG,
17
- OPT_TOKENIZER_TAG,
18
- GPTJ_TOKENIZER_TAG,
19
- GPTNEO_TOKENIZER_TAG,
20
- GPT4_TOKENIZER_TAG,
21
14
  ABLATION_MODEL_TAG,
15
+ TEXT_TO_IMAGE_MODEL_TAG,
22
16
  VISION_LANGUAGE_MODEL_TAG,
23
17
  )
24
- from .runner import RunSpec
25
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
18
+ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
19
+ from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
20
+ from .run_spec import RunSpec
21
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
26
22
  from .augmentations.perturbation import PerturbationSpec
27
23
  from .augmentations.data_augmenter import DataAugmenterSpec
24
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
28
25
 
29
26
 
30
27
  class RunExpander(ABC):
@@ -227,12 +224,16 @@ class AddToStopRunExpander(RunExpander):
227
224
  self.value = value
228
225
 
229
226
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
227
+ if self.value == "newline":
228
+ stop_sequence = "\n"
229
+ else:
230
+ stop_sequence = self.value
230
231
  return [
231
232
  replace(
232
233
  run_spec,
233
234
  name=run_spec.name,
234
235
  adapter_spec=replace(
235
- run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
236
+ run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
236
237
  ),
237
238
  ),
238
239
  ]
@@ -261,6 +262,238 @@ class GlobalPrefixRunExpander(RunExpander):
261
262
  ]
262
263
 
263
264
 
265
# Instruction-following models (GPT-4, Claude, PaLM 2, ...) do not pick up the
# in-context-learning pattern the way base models do; they tend to answer in a
# verbose, assistant-like manner. For such models the prompt therefore carries
# explicit instructions to mimic the format of the in-context examples.
IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX = (
    "Here are some input-output examples. "
    "Read the examples carefully to figure out the mapping. "
    "The output of the last example is not given, "
    "and your job is to figure out what it is."
)

IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
    "Please provide the output to this last example. "
    "It is critical to follow the format of the preceding outputs!"
)
279
+
280
+
281
class AnthropicClaude2RunExpander(RunExpander):
    """
    Prompt customization for Anthropic Claude 1 and Claude 2 models.

    The global prefix starts with the Human turn marker followed by the
    in-context-learning instructions; the global suffix repeats the format
    instructions, adds the Assistant turn marker and ends with the stripped
    output prefix of the original adapter spec.
    """

    name = "anthropic"

    # Both markers have to appear in the prompt to pass prompt validation;
    # otherwise the Anthropic API will return an error.
    # See: https://docs.anthropic.com/claude/reference/prompt-validation
    HUMAN_PROMPT = "\n\nHuman:"
    AI_PROMPT = "\n\nAssistant:"

    def __init__(self):
        pass

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return a single run spec whose adapter spec carries the Claude-specific global prefix/suffix."""
        base_spec = run_spec.adapter_spec
        prompt_prefix = "".join(
            [
                AnthropicClaude2RunExpander.HUMAN_PROMPT,
                " ",
                IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX,
                "\n\n",
            ]
        )
        prompt_suffix = "".join(
            [
                "\n\n",
                IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX,
                AnthropicClaude2RunExpander.AI_PROMPT,
                " ",
                base_spec.output_prefix.strip(),
            ]
        )
        claude_spec = replace(base_spec, global_prefix=prompt_prefix, global_suffix=prompt_suffix)
        return [replace(run_spec, name=run_spec.name, adapter_spec=claude_spec)]
317
+
318
+
319
class AnthropicClaude3RunExpander(RunExpander):
    """Prompt customization for Anthropic Claude 3 models.

    Only multiple-choice-joint runs are changed: their instructions are
    prefixed with a request to answer with a single letter. Every other run
    spec is returned as-is.
    """

    name = "claude_3"

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return the run spec, with single-letter instructions added for multiple-choice-joint runs."""
        adapter_spec = run_spec.adapter_spec
        if adapter_spec.method != ADAPT_MULTIPLE_CHOICE_JOINT:
            return [run_spec]

        single_letter_hint = "Answer with only a single letter."
        existing_instructions = adapter_spec.instructions
        # Keep any existing instructions, placed after the single-letter hint.
        if existing_instructions:
            combined = f"{single_letter_hint}\n\n{existing_instructions}"
        else:
            combined = single_letter_hint
        updated_spec = replace(adapter_spec, instructions=combined)
        return [replace(run_spec, adapter_spec=updated_spec)]
336
+
337
+
338
class OpenAIRunExpander(RunExpander):
    """
    Prompt customization for OpenAI models.

    Generation runs get the in-context-learning instructions as global
    prefix and suffix; runs using any other adaptation method are returned
    unchanged.
    """

    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.

    name = "openai"

    def __init__(self):
        pass

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return the run spec, wrapped with format instructions when the method is generation."""
        adapter_spec = run_spec.adapter_spec
        if adapter_spec.method != ADAPT_GENERATION:
            return [run_spec]

        instructed_spec = replace(
            adapter_spec,
            global_prefix=f"{IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX}\n\n",
            # The suffix ends with the (stripped) output prefix of the original spec.
            global_suffix=f"\n\n{IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX}\n{adapter_spec.output_prefix.strip()}",
        )
        return [replace(run_spec, name=run_spec.name, adapter_spec=instructed_spec)]
369
+
370
+
371
class GoogleRunExpander(RunExpander):
    """
    Prompt customization for Google models.

    Generation runs get the in-context-learning instructions as global
    prefix and suffix; runs using any other adaptation method are returned
    unchanged.
    """

    # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.

    name = "google"

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return the run spec, wrapped with format instructions when the method is generation."""
        adapter_spec = run_spec.adapter_spec
        if adapter_spec.method != ADAPT_GENERATION:
            return [run_spec]

        instructed_spec = replace(
            adapter_spec,
            global_prefix=f"{IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX}\n\n",
            # The suffix ends with the (stripped) output prefix of the original spec.
            global_suffix=f"\n\n{IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX}\n{adapter_spec.output_prefix.strip()}",
        )
        return [replace(run_spec, name=run_spec.name, adapter_spec=instructed_spec)]
399
+
400
+
401
class MistralRunExpander(RunExpander):
    """Prompt customization for Mistral models.

    Generation runs get the in-context-learning instructions as global
    prefix and suffix; runs using any other adaptation method are returned
    unchanged. Note that `name` is "output_format_instructions" rather than
    a vendor-specific name.
    """

    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.

    name = "output_format_instructions"

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return the run spec, wrapped with format instructions when the method is generation."""
        adapter_spec = run_spec.adapter_spec
        if adapter_spec.method != ADAPT_GENERATION:
            return [run_spec]

        instructed_spec = replace(
            adapter_spec,
            global_prefix=f"{IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX}\n\n",
            # The suffix ends with the (stripped) output prefix of the original spec.
            global_suffix=f"\n\n{IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX}\n{adapter_spec.output_prefix.strip()}",
        )
        return [replace(run_spec, name=run_spec.name, adapter_spec=instructed_spec)]
426
+
427
+
428
class IDEFICSInstructRunExpander(RunExpander):
    """
    Prompt customization for IDEFICS instruct models, which require a specific format.
    See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
    """

    name = "idefics_instruct"

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return a single run spec using the User/Assistant turn format."""
        end_of_utterance = "<end_of_utterance>"
        chat_spec = replace(
            run_spec.adapter_spec,
            input_prefix="User: ",
            input_suffix=end_of_utterance,
            output_prefix="\nAssistant: ",
            output_suffix=end_of_utterance,
            # Replaces (does not extend) any stop sequences already on the spec.
            stop_sequences=[end_of_utterance],
        )
        return [replace(run_spec, name=run_spec.name, adapter_spec=chat_spec)]
451
+
452
+
453
class LlavaRunExpander(RunExpander):
    """
    Prompt customization for Llava 1.5 models, which should use a specific format.
    See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
    """

    name = "llava"

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return a single run spec using the USER/ASSISTANT turn format with empty suffixes."""
        llava_spec = replace(
            run_spec.adapter_spec,
            input_prefix="USER: <image>",
            input_suffix="",
            output_prefix="\nASSISTANT: ",
            output_suffix="",
        )
        return [replace(run_spec, name=run_spec.name, adapter_spec=llava_spec)]
475
+
476
+
477
class OpenFlamingoRunExpander(RunExpander):
    """
    Prompt customization for OpenFlamingo, following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
    """

    name = "open_flamingo"

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return a single run spec whose input prefix is preceded by `<|endofchunk|>`."""
        base_spec = run_spec.adapter_spec
        chunked_prefix = f"<|endofchunk|>{base_spec.input_prefix}"
        flamingo_spec = replace(base_spec, input_prefix=chunked_prefix)
        return [replace(run_spec, name=run_spec.name, adapter_spec=flamingo_spec)]
495
+
496
+
264
497
  class FormatPromptRunExpander(RunExpander):
265
498
  """Adds a prefix and suffix to the prompt."""
266
499
 
@@ -277,7 +510,7 @@ class FormatPromptRunExpander(RunExpander):
277
510
  name=run_spec.name,
278
511
  adapter_spec=replace(
279
512
  run_spec.adapter_spec,
280
- global_prefix=self.prefix,
513
+ input_prefix=self.prefix,
281
514
  output_prefix=self.suffix,
282
515
  ),
283
516
  ),
@@ -306,6 +539,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
306
539
  "one": [1],
307
540
  "all": [0, 1, 2, 4, 8, 16], # Cap at 16 due to limited context length
308
541
  "big_bench_few_shot_setting": [0, 1, 2, 3], # Commonly used few-shot setting in BIG-bench
542
+ "heim_human_eval": [0, 1, 2, 4, 8],
309
543
  }
310
544
 
311
545
 
@@ -313,7 +547,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
313
547
  """For overriding the number of eval instances at the run level."""
314
548
 
315
549
  name = "max_eval_instances"
316
- values_dict: Dict[str, List[Any]] = {}
550
+ values_dict: Dict[str, List[Any]] = {
551
+ "default": [1_000],
552
+ "heim_default": [100],
553
+ "heim_fid": [30_000],
554
+ "heim_art_styles": [17],
555
+ }
317
556
 
318
557
 
319
558
  class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -326,6 +565,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
326
565
  }
327
566
 
328
567
 
568
class NumTrialRunExpander(ReplaceValueRunExpander):
    """Overrides `num_trials`, to obtain different generations for the same requests."""

    name = "num_trials"
    # Named presets mapped to the list of values to expand into.
    values_dict = {"heim_efficiency": [5]}
575
+
576
+
329
577
  class ModelRunExpander(ReplaceValueRunExpander):
330
578
  """
331
579
  For specifying different models.
@@ -355,10 +603,6 @@ class ModelRunExpander(ReplaceValueRunExpander):
355
603
  "code": get_all_code_models(),
356
604
  "instruction_following": get_all_instruction_following_models(),
357
605
  "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
358
- "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
359
- "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
360
- "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
361
- "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
362
606
  "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
363
607
  "biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
364
608
  "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
@@ -371,6 +615,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
371
615
  "openai/text-davinci-003",
372
616
  ],
373
617
  "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
618
+ "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
374
619
  "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
375
620
  }
376
621
 
@@ -388,6 +633,40 @@ class ModelRunExpander(ReplaceValueRunExpander):
388
633
  return values_dict
389
634
 
390
635
 
636
class ModelDeploymentRunExpander(ReplaceValueRunExpander):
    """Overrides the model deployment of a run."""

    name = "model_deployment"
    # No named presets are defined for this expander.
    values_dict: Dict[str, List[Any]] = dict()
641
+
642
+
643
class EvalSplitRunExpander(RunExpander):
    """Restricts evaluation to a single split.

    By default, evaluation instances are drawn from both test and validation splits.
    With this run expander, evaluation instances come from only the test split or
    only the validation split.
    """

    # NOTE: Deliberately not a `ReplaceValueRunExpander` subclass, because the
    # run expander name should be "eval_split", not "eval_splits".

    name = "eval_split"

    def __init__(self, value):
        """Store the requested split; raise ValueError unless it is TEST_SPLIT or VALID_SPLIT."""
        allowed_splits = (TEST_SPLIT, VALID_SPLIT)
        if value not in allowed_splits:
            raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
        self.split = value

    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
        """Return a single run spec limited to the chosen split, with the split appended to its name."""
        # The first argument of a run name is introduced by ":", later ones by ",".
        separator = "," if ":" in run_spec.name else ":"
        split_only_spec = replace(run_spec.adapter_spec, eval_splits=[self.split])
        renamed = f"{run_spec.name}{separator}eval_split={self.split}"
        return [replace(run_spec, name=renamed, adapter_spec=split_only_spec)]
668
+
669
+
391
670
  ############################################################
392
671
 
393
672
 
@@ -576,6 +855,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
576
855
  )
577
856
 
578
857
 
858
def translate(language_code: str) -> PerturbationSpec:
    """Build the `PerturbationSpec` for a `TranslatePerturbation` with the given language code."""
    perturbation_class = "helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation"
    perturbation_args = {"language_code": language_code}
    return PerturbationSpec(class_name=perturbation_class, args=perturbation_args)
863
+
864
+
865
def suffix(text: str) -> PerturbationSpec:
    """Build the `PerturbationSpec` for a `SuffixPerturbation` with `text` as its suffix."""
    perturbation_class = "helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation"
    perturbation_args = {"suffix": text}
    return PerturbationSpec(class_name=perturbation_class, args=perturbation_args)
870
+
871
+
579
872
  # Specifies the data augmentations that we're interested in trying out.
580
873
  # Concretely, this is a mapping from the name (which is specified in a conf
581
874
  # file or the CLI) to a list of options to try, where each option is a list of perturbations.
@@ -767,6 +1060,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
767
1060
  mandarin_to_cantonese(),
768
1061
  ]
769
1062
  },
1063
+ # Multilinguality
1064
+ "chinese": {"chinese": [translate(language_code="zh-CN")]},
1065
+ "hindi": {"hindi": [translate(language_code="hi")]},
1066
+ "spanish": {"spanish": [translate(language_code="es")]},
1067
+ # Styles
1068
+ "art": {
1069
+ "art": [
1070
+ suffix("oil painting"),
1071
+ suffix("watercolor"),
1072
+ suffix("pencil sketch"),
1073
+ suffix("animation"),
1074
+ suffix("vector graphics"),
1075
+ suffix("pixel art"),
1076
+ ]
1077
+ },
770
1078
  }
771
1079
 
772
1080
 
@@ -880,18 +1188,18 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
880
1188
  "huggingface/santacoder": ["bigcode/santacoder"],
881
1189
  "huggingface/starcoder": ["bigcode/starcoder"],
882
1190
  }
883
- model_tags_and_tokenizers = [
884
- (GPT2_TOKENIZER_TAG, "huggingface/gpt2"),
885
- (AI21_TOKENIZER_TAG, "ai21/j1"),
886
- (COHERE_TOKENIZER_TAG, "cohere/cohere"),
887
- (OPT_TOKENIZER_TAG, "meta/opt"),
888
- (GPTJ_TOKENIZER_TAG, "eleutherai/gptj"),
889
- (GPT4_TOKENIZER_TAG, "openai/cl100k_base"),
890
- (GPTNEO_TOKENIZER_TAG, "eleutherai/gptneox"),
1191
+ list_tokenizers = [
1192
+ "huggingface/gpt2",
1193
+ "ai21/j1",
1194
+ "cohere/cohere",
1195
+ "meta/opt",
1196
+ "eleutherai/gptj",
1197
+ "openai/cl100k_base",
1198
+ "eleutherai/gptneox",
891
1199
  ]
892
- for model_tag, tokenizer in model_tags_and_tokenizers:
893
- for model in get_model_names_with_tag(model_tag):
894
- model_to_tokenizer_mapping[model] = [tokenizer]
1200
+ for tokenizer_name in list_tokenizers:
1201
+ for model in get_model_names_with_tokenizer(tokenizer_name):
1202
+ model_to_tokenizer_mapping[model] = [tokenizer_name]
895
1203
  # tokenizer=default will map to using the right tokenizer for a given model.
896
1204
  values_dict = {"default": model_to_tokenizer_mapping}
897
1205
 
@@ -907,10 +1215,10 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
907
1215
  self.all_values = [value]
908
1216
 
909
1217
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
910
- # Find right tokenizer given model.
1218
+ # Find right tokenizer given model deployment name.
911
1219
  if isinstance(self.all_values, dict):
912
- model: str = run_spec.adapter_spec.model
913
- self.values = self.all_values[model] if model in self.all_values else []
1220
+ deployment: str = run_spec.adapter_spec.model_deployment
1221
+ self.values = self.all_values[deployment] if deployment in self.all_values else []
914
1222
  else:
915
1223
  self.values = self.all_values
916
1224
  return super().expand(run_spec)
@@ -1113,12 +1421,15 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1113
1421
  MaxTrainInstancesRunExpander,
1114
1422
  MaxEvalInstancesRunExpander,
1115
1423
  NumOutputsRunExpander,
1424
+ NumTrialRunExpander,
1116
1425
  ModelRunExpander,
1426
+ ModelDeploymentRunExpander,
1117
1427
  DataAugmentationRunExpander,
1118
1428
  TokenizerRunExpander,
1119
1429
  NumPromptTokensRunExpander,
1120
1430
  NumOutputTokensRunExpander,
1121
1431
  ChatMLRunExpander,
1432
+ EvalSplitRunExpander,
1122
1433
  ]
1123
1434
 
1124
1435
 
@@ -0,0 +1,93 @@
1
+ from dataclasses import dataclass, field
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict, Iterable, List, Optional, TypeVar
6
+
7
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
+ from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
9
+ from helm.benchmark.metrics.metric import MetricSpec
10
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
11
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
12
+
13
+
14
@dataclass(frozen=True)
class RunSpec:
    """
    Specifies how to do a single run, which gets a scenario, adapts it, and
    computes a list of stats based on the defined metrics.
    """

    name: str
    """Unique identifier of the RunSpec"""

    scenario_spec: ScenarioSpec
    """Which scenario"""

    adapter_spec: AdapterSpec
    """Specifies how to adapt an instance into a set of requests"""

    metric_specs: List[MetricSpec]
    """What to evaluate on"""

    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
    """Data augmenter. The default `DataAugmenterSpec` does nothing."""

    groups: List[str] = field(default_factory=list)
    """Groups that this run spec belongs to (for aggregation)"""

    annotators: Optional[List[AnnotatorSpec]] = None
    """Annotators to use for this run spec"""

    def __post_init__(self):
        """
        `self.name` is used as the name of the output folder for the `RunSpec`.
        Clean up `self.name` by replacing any "/"'s (and any occurrences of the
        platform's path separator) with "_".
        """
        # TODO: Don't mutate name! clean this up before passing it into the constructor here
        sanitized_name = self.name
        # Replacing only `os.path.sep` would leave "/" untouched on platforms whose
        # separator is not "/" (e.g. "\\" on Windows), so "/" is handled explicitly.
        for separator in ("/", os.path.sep):
            sanitized_name = sanitized_name.replace(separator, "_")
        # The dataclass is frozen, so the attribute has to be set through `object`.
        object.__setattr__(self, "name", sanitized_name)
49
+
50
+
51
# A run spec function may take any arguments and returns a `RunSpec`.
RunSpecFunction = Callable[..., RunSpec]


# Dict of run spec function names to run spec functions.
_REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = dict()


# Type variable used so that decorating a run spec function preserves its type.
F = TypeVar("F", bound=RunSpecFunction)
59
+
60
+
61
def run_spec_function(name: str) -> Callable[[F], F]:
    """Return a decorator that registers the decorated run spec function under `name`.

    The decorator raises `ValueError` if a function is already registered
    under that name; otherwise it records the function and returns it as-is.
    """

    def register(run_spec_fn: F) -> F:
        already_registered = name in _REGISTERED_RUN_SPEC_FUNCTIONS
        if already_registered:
            raise ValueError(f"A run spec function with name {name} already exists")
        _REGISTERED_RUN_SPEC_FUNCTIONS[name] = run_spec_fn
        return run_spec_fn

    return register
71
+
72
+
73
+ # Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
74
+ def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
75
+ # Specifying the second argument (prefix) to iter_modules makes the
76
+ # returned name an absolute name instead of a relative one. This allows
77
+ # import_module to work without having to do additional modification to
78
+ # the name.
79
+ return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
80
+
81
+
82
def discover_run_spec_functions() -> None:
    """Import every module directly under helm.benchmark.run_specs.

    Importing a module runs its `@run_spec_function` decorators, which is
    what registers the run spec functions.
    """
    import helm.benchmark.run_specs  # noqa

    for module_info in _iter_namespace(helm.benchmark.run_specs):
        importlib.import_module(module_info.name)
88
+
89
+
90
def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
    """Look up the run spec function registered under `name`; `None` when there is none."""
    # Run discovery first so the registry is populated before the lookup.
    discover_run_spec_functions()
    registry = _REGISTERED_RUN_SPEC_FUNCTIONS
    return registry.get(name)