crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (499) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
  2. crfm_helm-0.5.1.dist-info/RECORD +654 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  11. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  12. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  13. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  14. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  15. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  16. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  17. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  18. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  19. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  20. helm/benchmark/adaptation/request_state.py +6 -1
  21. helm/benchmark/adaptation/scenario_state.py +6 -2
  22. helm/benchmark/annotation/annotator.py +43 -0
  23. helm/benchmark/annotation/annotator_factory.py +61 -0
  24. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  25. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  26. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  27. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  28. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  29. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  30. helm/benchmark/annotation_executor.py +124 -0
  31. helm/benchmark/augmentations/data_augmenter.py +0 -2
  32. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  33. helm/benchmark/augmentations/perturbation.py +25 -3
  34. helm/benchmark/augmentations/perturbation_description.py +1 -1
  35. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  36. helm/benchmark/augmentations/test_perturbation.py +41 -7
  37. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  38. helm/benchmark/config_registry.py +7 -1
  39. helm/benchmark/executor.py +46 -16
  40. helm/benchmark/huggingface_registration.py +20 -7
  41. helm/benchmark/metrics/basic_metrics.py +169 -664
  42. helm/benchmark/metrics/bbq_metrics.py +3 -4
  43. helm/benchmark/metrics/bias_metrics.py +6 -6
  44. helm/benchmark/metrics/classification_metrics.py +11 -8
  45. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  46. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  47. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  48. helm/benchmark/metrics/common_metric_specs.py +167 -0
  49. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  50. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  51. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  52. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  53. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  54. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  55. helm/benchmark/metrics/efficiency_metrics.py +213 -0
  56. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  57. helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
  58. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  59. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  60. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  61. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  62. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  63. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  64. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  65. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  66. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  68. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  69. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  70. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  71. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  72. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  73. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  74. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  75. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  76. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  77. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  78. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  79. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  80. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  81. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  82. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  83. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  84. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  85. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  86. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  87. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  88. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  89. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  90. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  91. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  92. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  93. helm/benchmark/metrics/metric.py +93 -172
  94. helm/benchmark/metrics/metric_name.py +0 -1
  95. helm/benchmark/metrics/metric_service.py +16 -0
  96. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  97. helm/benchmark/metrics/ranking_metrics.py +2 -2
  98. helm/benchmark/metrics/reference_metric.py +148 -0
  99. helm/benchmark/metrics/summac/model_summac.py +0 -2
  100. helm/benchmark/metrics/summarization_metrics.py +2 -2
  101. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  102. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  103. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  104. helm/benchmark/metrics/test_metric.py +2 -2
  105. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  106. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  107. helm/benchmark/metrics/toxicity_utils.py +23 -0
  108. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  109. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  110. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  111. helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
  112. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  113. helm/benchmark/model_deployment_registry.py +74 -0
  114. helm/benchmark/model_metadata_registry.py +41 -1
  115. helm/benchmark/multi_gpu_runner.py +133 -0
  116. helm/benchmark/presentation/create_plots.py +8 -7
  117. helm/benchmark/presentation/run_display.py +26 -10
  118. helm/benchmark/presentation/schema.py +15 -40
  119. helm/benchmark/presentation/summarize.py +119 -79
  120. helm/benchmark/presentation/table.py +8 -8
  121. helm/benchmark/presentation/test_contamination.py +2 -2
  122. helm/benchmark/presentation/test_run_entry.py +1 -2
  123. helm/benchmark/presentation/test_summarize.py +3 -3
  124. helm/benchmark/run.py +54 -26
  125. helm/benchmark/run_expander.py +205 -35
  126. helm/benchmark/run_spec.py +93 -0
  127. helm/benchmark/run_spec_factory.py +163 -0
  128. helm/benchmark/run_specs/__init__.py +0 -0
  129. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  130. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  131. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  132. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  133. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  134. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  135. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  136. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  137. helm/benchmark/run_specs/vlm_run_specs.py +757 -0
  138. helm/benchmark/runner.py +51 -57
  139. helm/benchmark/runner_config_registry.py +21 -0
  140. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  141. helm/benchmark/scenarios/bold_scenario.py +2 -2
  142. helm/benchmark/scenarios/code_scenario.py +1 -0
  143. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  144. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  145. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  146. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  147. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  148. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  149. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  150. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  151. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  152. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  153. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  154. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  155. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  156. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  157. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  158. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  159. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  160. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  162. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  163. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  164. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  165. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  166. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  167. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  168. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  169. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  170. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  171. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  172. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  173. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  174. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  175. helm/benchmark/scenarios/math_scenario.py +19 -2
  176. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  177. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  178. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  179. helm/benchmark/scenarios/scenario.py +4 -0
  180. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  181. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  182. helm/benchmark/scenarios/test_scenario.py +6 -3
  183. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  184. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  185. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  186. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  187. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  188. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  189. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  190. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  191. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  192. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  193. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
  194. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  198. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  199. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  200. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  201. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  202. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  203. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  204. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  205. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  206. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  207. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  208. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  209. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  210. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  211. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  212. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  213. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  214. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  215. helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
  216. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  217. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  218. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  219. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
  220. helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
  221. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  222. helm/benchmark/server.py +24 -1
  223. helm/benchmark/slurm_runner.py +70 -49
  224. helm/benchmark/static/benchmarking.js +1 -1
  225. helm/benchmark/static/schema_classic.yaml +258 -1066
  226. helm/benchmark/static/schema_image2structure.yaml +304 -0
  227. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  228. helm/benchmark/static/schema_lite.yaml +2 -227
  229. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  230. helm/benchmark/static/schema_unitxt.yaml +428 -0
  231. helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
  232. helm/benchmark/static/schema_vlm.yaml +823 -0
  233. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  234. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  235. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  236. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  237. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  238. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  239. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  240. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  241. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  242. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  243. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  244. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  245. helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
  246. helm/benchmark/static_build/assets/index-878a1094.css +1 -0
  247. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  248. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  249. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  250. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  251. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  252. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  253. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  254. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  255. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  256. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  257. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  258. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  259. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  260. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  261. helm/benchmark/static_build/config.js +4 -0
  262. helm/benchmark/static_build/index.html +20 -0
  263. helm/benchmark/test_data_preprocessor.py +3 -3
  264. helm/benchmark/test_run_expander.py +1 -1
  265. helm/benchmark/window_services/ai21_window_service.py +22 -33
  266. helm/benchmark/window_services/cohere_window_service.py +1 -63
  267. helm/benchmark/window_services/default_window_service.py +2 -44
  268. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  269. helm/benchmark/window_services/ice_window_service.py +0 -34
  270. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  271. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  272. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  273. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  274. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  275. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  276. helm/benchmark/window_services/local_window_service.py +21 -4
  277. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  278. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  279. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  280. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  281. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  282. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  283. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  284. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  285. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  286. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  287. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  288. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  289. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  290. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  291. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  292. helm/benchmark/window_services/test_utils.py +3 -2
  293. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  294. helm/benchmark/window_services/window_service.py +42 -0
  295. helm/benchmark/window_services/window_service_factory.py +4 -1
  296. helm/benchmark/window_services/yalm_window_service.py +0 -27
  297. helm/clients/__init__.py +0 -0
  298. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  299. helm/clients/aleph_alpha_client.py +112 -0
  300. helm/{proxy/clients → clients}/anthropic_client.py +233 -18
  301. helm/{proxy/clients → clients}/auto_client.py +59 -31
  302. helm/clients/bedrock_client.py +128 -0
  303. helm/clients/bedrock_utils.py +72 -0
  304. helm/{proxy/clients → clients}/client.py +65 -7
  305. helm/clients/clip_score_client.py +49 -0
  306. helm/clients/clip_scorers/__init__.py +0 -0
  307. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  308. helm/clients/clip_scorers/clip_scorer.py +50 -0
  309. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  310. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  311. helm/clients/gcs_client.py +82 -0
  312. helm/{proxy/clients → clients}/google_client.py +5 -5
  313. helm/clients/google_translate_client.py +35 -0
  314. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  315. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  316. helm/clients/image_generation/__init__.py +0 -0
  317. helm/clients/image_generation/adobe_vision_client.py +78 -0
  318. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  319. helm/clients/image_generation/cogview2/__init__.py +0 -0
  320. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  321. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  322. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  323. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  324. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  325. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  326. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  327. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  328. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  329. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  330. helm/clients/image_generation/cogview2_client.py +191 -0
  331. helm/clients/image_generation/dalle2_client.py +192 -0
  332. helm/clients/image_generation/dalle3_client.py +108 -0
  333. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  334. helm/clients/image_generation/dalle_mini/data.py +442 -0
  335. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  336. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  337. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  338. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  339. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  340. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  341. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  342. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  343. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  344. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  345. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  346. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  347. helm/clients/image_generation/dalle_mini_client.py +190 -0
  348. helm/clients/image_generation/deep_floyd_client.py +78 -0
  349. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  350. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  351. helm/clients/image_generation/lexica_client.py +86 -0
  352. helm/clients/image_generation/mindalle/__init__.py +0 -0
  353. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  354. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  355. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  356. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  357. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  358. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  359. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  360. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  361. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  362. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  363. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  364. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  365. helm/clients/image_generation/mindalle_client.py +115 -0
  366. helm/clients/image_generation/nudity_check_client.py +64 -0
  367. helm/clients/image_generation/together_image_generation_client.py +111 -0
  368. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  369. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  370. helm/clients/mistral_client.py +134 -0
  371. helm/clients/moderation_api_client.py +109 -0
  372. helm/clients/open_lm_client.py +43 -0
  373. helm/clients/openai_client.py +301 -0
  374. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  375. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  376. helm/clients/simple_client.py +64 -0
  377. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  378. helm/clients/test_client.py +100 -0
  379. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  380. helm/clients/test_simple_client.py +19 -0
  381. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  382. helm/{proxy/clients → clients}/together_client.py +104 -73
  383. helm/clients/vertexai_client.py +400 -0
  384. helm/clients/vision_language/__init__.py +0 -0
  385. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  386. helm/clients/vision_language/huggingface_vlm_client.py +111 -0
  387. helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
  388. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  389. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  390. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  391. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  392. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  393. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  394. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  395. helm/clients/vision_language/open_flamingo_client.py +155 -0
  396. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  397. helm/clients/vllm_client.py +46 -0
  398. helm/common/cache.py +16 -4
  399. helm/common/cache_backend_config.py +47 -0
  400. helm/common/clip_score_request.py +41 -0
  401. helm/common/file_caches/__init__.py +0 -0
  402. helm/common/file_caches/file_cache.py +16 -0
  403. helm/common/file_caches/local_file_cache.py +61 -0
  404. helm/common/file_caches/test_local_file_cache.py +25 -0
  405. helm/common/file_upload_request.py +27 -0
  406. helm/common/general.py +1 -1
  407. helm/common/image_generation_parameters.py +25 -0
  408. helm/common/images_utils.py +33 -3
  409. helm/common/key_value_store.py +35 -4
  410. helm/common/media_object.py +13 -0
  411. helm/common/moderations_api_request.py +71 -0
  412. helm/common/mongo_key_value_store.py +3 -3
  413. helm/common/multimodal_request_utils.py +31 -0
  414. helm/common/nudity_check_request.py +29 -0
  415. helm/common/request.py +15 -17
  416. helm/common/test_general.py +6 -0
  417. helm/common/tokenization_request.py +1 -1
  418. helm/config/model_deployments.yaml +1159 -538
  419. helm/config/model_metadata.yaml +868 -41
  420. helm/config/tokenizer_configs.yaml +149 -43
  421. helm/proxy/accounts.py +31 -4
  422. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  423. helm/proxy/critique/model_critique_client.py +8 -6
  424. helm/proxy/example_queries.py +29 -17
  425. helm/proxy/server.py +70 -5
  426. helm/proxy/services/remote_service.py +31 -0
  427. helm/proxy/services/server_service.py +96 -16
  428. helm/proxy/services/service.py +30 -0
  429. helm/proxy/services/test_remote_service.py +4 -3
  430. helm/proxy/services/test_service.py +0 -12
  431. helm/proxy/test_accounts.py +32 -0
  432. helm/proxy/token_counters/auto_token_counter.py +37 -37
  433. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  434. helm/proxy/token_counters/token_counter.py +3 -5
  435. helm/tokenizers/__init__.py +0 -0
  436. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  437. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  438. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  439. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  440. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  441. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  442. helm/tokenizers/simple_tokenizer.py +33 -0
  443. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  444. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  445. helm/tokenizers/test_simple_tokenizer.py +33 -0
  446. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  447. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  448. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  449. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  450. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  451. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  452. helm/benchmark/run_specs.py +0 -2762
  453. helm/benchmark/test_model_deployment_definition.py +0 -92
  454. helm/benchmark/test_model_properties.py +0 -1570
  455. helm/benchmark/vlm_run_specs.py +0 -97
  456. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  457. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  458. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  459. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  460. helm/benchmark/window_services/t511b_window_service.py +0 -30
  461. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  462. helm/benchmark/window_services/ul2_window_service.py +0 -30
  463. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  464. helm/common/cache_utils.py +0 -14
  465. helm/proxy/clients/aleph_alpha_client.py +0 -95
  466. helm/proxy/clients/goose_ai_client.py +0 -99
  467. helm/proxy/clients/microsoft_client.py +0 -180
  468. helm/proxy/clients/openai_client.py +0 -206
  469. helm/proxy/clients/simple_client.py +0 -60
  470. helm/proxy/clients/test_client.py +0 -49
  471. helm/proxy/clients/vertexai_client.py +0 -115
  472. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  473. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  474. helm/proxy/token_counters/free_token_counter.py +0 -12
  475. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  476. helm/proxy/token_counters/openai_token_counter.py +0 -22
  477. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  478. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  479. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  480. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
  481. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
  482. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
  483. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  484. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  485. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  486. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  487. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  488. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  489. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  490. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  491. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  492. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  493. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  494. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  495. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  496. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  497. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  498. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  499. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -8,19 +8,22 @@ from helm.benchmark.model_metadata_registry import (
8
8
  get_all_code_models,
9
9
  get_all_models,
10
10
  get_all_text_models,
11
+ get_model_metadata,
11
12
  get_model_names_with_tag,
12
13
  FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
13
14
  LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
14
15
  ABLATION_MODEL_TAG,
16
+ TEXT_TO_IMAGE_MODEL_TAG,
15
17
  VISION_LANGUAGE_MODEL_TAG,
18
+ INSTRUCTION_FOLLOWING_MODEL_TAG,
16
19
  )
17
20
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
18
- from helm.common.general import handle_module_not_found_error
19
21
  from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
20
- from .runner import RunSpec
21
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
22
+ from .run_spec import RunSpec
23
+ from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
22
24
  from .augmentations.perturbation import PerturbationSpec
23
25
  from .augmentations.data_augmenter import DataAugmenterSpec
26
+ from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
24
27
 
25
28
 
26
29
  class RunExpander(ABC):
@@ -223,12 +226,16 @@ class AddToStopRunExpander(RunExpander):
223
226
  self.value = value
224
227
 
225
228
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
229
+ if self.value == "newline":
230
+ stop_sequence = "\n"
231
+ else:
232
+ stop_sequence = self.value
226
233
  return [
227
234
  replace(
228
235
  run_spec,
229
236
  name=run_spec.name,
230
237
  adapter_spec=replace(
231
- run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
238
+ run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
232
239
  ),
233
240
  ),
234
241
  ]
@@ -273,33 +280,37 @@ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
273
280
  )
274
281
 
275
282
 
276
- class AnthropicRunExpander(RunExpander):
283
+ class AnthropicClaude2RunExpander(RunExpander):
277
284
  """
278
- Custom prompt for Anthropic models.
285
+ Custom prompt for Anthropic Claude 1 and Claude 2 models.
279
286
  These models need more explicit instructions about following the format.
280
287
  """
281
288
 
282
289
  name = "anthropic"
283
290
 
291
+ # These strings must be added to the prompt in order to pass prompt validation,
292
+ # otherwise the Anthropic API will return an error.
293
+ # See: https://docs.anthropic.com/claude/reference/prompt-validation
294
+ HUMAN_PROMPT = "\n\nHuman:"
295
+ AI_PROMPT = "\n\nAssistant:"
296
+
284
297
  def __init__(self):
285
298
  pass
286
299
 
287
300
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
288
- try:
289
- import anthropic
290
- except ModuleNotFoundError as e:
291
- handle_module_not_found_error(e, ["anthropic"])
292
-
293
301
  return [
294
302
  replace(
295
303
  run_spec,
296
304
  name=run_spec.name,
297
305
  adapter_spec=replace(
298
306
  run_spec.adapter_spec,
299
- global_prefix=anthropic.HUMAN_PROMPT + " " + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
307
+ global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
308
+ + " "
309
+ + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
310
+ + "\n\n",
300
311
  global_suffix="\n\n"
301
312
  + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
302
- + anthropic.AI_PROMPT
313
+ + AnthropicClaude2RunExpander.AI_PROMPT
303
314
  + " "
304
315
  + run_spec.adapter_spec.output_prefix.strip(),
305
316
  ),
@@ -307,21 +318,66 @@ class AnthropicRunExpander(RunExpander):
307
318
  ]
308
319
 
309
320
 
310
- class OpenAIRunExpander(RunExpander):
311
- """
312
- Custom prompt for OpenAI models.
313
- These models need more explicit instructions about following the format.
314
- """
321
+ class AnthropicClaude3RunExpander(RunExpander):
322
+ """Custom prompts for Anthropic Claude 3 models."""
315
323
 
316
- # TODO: Refactor out common logic between this and GoogleRunExpander.
324
+ name = "claude_3"
317
325
 
318
- name = "openai"
326
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
327
+ # Remove all stop sequences that do not contain non-whitespace characters.
328
+ # This prevents the Anthropic API from returning the following error:
329
+ # "stop_sequences: each stop sequence must contain non-whitespace"
330
+ stop_sequences_with_non_whitespace = [
331
+ stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
332
+ ]
333
+ run_spec = replace(
334
+ run_spec,
335
+ adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
336
+ )
337
+ if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
338
+ instructions = "Answer with only a single letter."
339
+ if run_spec.adapter_spec.instructions:
340
+ instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
341
+ return [
342
+ replace(
343
+ run_spec,
344
+ adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
345
+ ),
346
+ ]
347
+ return [run_spec]
319
348
 
320
- def __init__(self):
321
- pass
349
+
350
+ class FollowFormatInstructionsRunExpander(RunExpander):
351
+ """Adds more explicit instructions about following the format to prompts.
352
+
353
+ The argument controls which models will receive these instructions.
354
+ If "all", all models receive these instructions.
355
+ If "instruct", only instruction-following models receive these instructions.
356
+
357
+ Only supports the generation adaptation method. Raises an error if used on
358
+ a RunSpec that uses a different adaptation method.
359
+
360
+ Note: For legacy backwards compatibility reasons, despite the use of the word
361
+ "instructions" in this run expander's name, this run expander actually
362
+ modifies the global_prefix and the global_suffix of the AdapterSpec rather than
363
+ the instructions.
364
+ """
365
+
366
+ name = "follow_format_instructions"
367
+
368
+ def __init__(self, value: str):
369
+ if value != "all" and value != "instruct":
370
+ raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
371
+ self.value = value
322
372
 
323
373
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
324
374
  if run_spec.adapter_spec.method != ADAPT_GENERATION:
375
+ raise Exception("follow_format_instructions run expander only supports the generation adaptation method")
376
+
377
+ if (
378
+ self.value == "instruct"
379
+ and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
380
+ ):
325
381
  return [run_spec]
326
382
 
327
383
  return [
@@ -340,31 +396,70 @@ class OpenAIRunExpander(RunExpander):
340
396
  ]
341
397
 
342
398
 
343
- class GoogleRunExpander(RunExpander):
399
+ class IDEFICSInstructRunExpander(RunExpander):
344
400
  """
345
- Custom prompt for Google models.
346
- These models need more explicit instructions about following the format.
401
+ Custom prompt for IDEFICS instruct models which require a specific format.
402
+ See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
347
403
  """
348
404
 
349
- # TODO: Refactor out common logic between this and OpenAIRunExpander.
405
+ name = "idefics_instruct"
406
+
407
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
408
+ return [
409
+ replace(
410
+ run_spec,
411
+ name=run_spec.name,
412
+ adapter_spec=replace(
413
+ run_spec.adapter_spec,
414
+ input_prefix="User: ",
415
+ input_suffix="<end_of_utterance>",
416
+ output_prefix="\nAssistant: ",
417
+ output_suffix="<end_of_utterance>",
418
+ stop_sequences=["<end_of_utterance>"],
419
+ ),
420
+ ),
421
+ ]
350
422
 
351
- name = "google"
423
+
424
+ class LlavaRunExpander(RunExpander):
425
+ """
426
+ Custom prompt for Llava 1.5 models which should use a specific format.
427
+ See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
428
+ """
429
+
430
+ name = "llava"
352
431
 
353
432
  def expand(self, run_spec: RunSpec) -> List[RunSpec]:
354
- if run_spec.adapter_spec.method != ADAPT_GENERATION:
355
- return [run_spec]
433
+ return [
434
+ replace(
435
+ run_spec,
436
+ name=run_spec.name,
437
+ adapter_spec=replace(
438
+ run_spec.adapter_spec,
439
+ input_prefix="USER: <image>",
440
+ input_suffix="",
441
+ output_prefix="\nASSISTANT: ",
442
+ output_suffix="",
443
+ ),
444
+ ),
445
+ ]
446
+
447
+
448
+ class OpenFlamingoRunExpander(RunExpander):
449
+ """
450
+ Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
451
+ """
452
+
453
+ name = "open_flamingo"
356
454
 
455
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
357
456
  return [
358
457
  replace(
359
458
  run_spec,
360
459
  name=run_spec.name,
361
460
  adapter_spec=replace(
362
461
  run_spec.adapter_spec,
363
- global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
364
- global_suffix="\n\n"
365
- + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
366
- + "\n"
367
- + run_spec.adapter_spec.output_prefix.strip(),
462
+ input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
368
463
  ),
369
464
  ),
370
465
  ]
@@ -415,6 +510,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
415
510
  "one": [1],
416
511
  "all": [0, 1, 2, 4, 8, 16], # Cap at 16 due to limited context length
417
512
  "big_bench_few_shot_setting": [0, 1, 2, 3], # Commonly used few-shot setting in BIG-bench
513
+ "vhelm": [0, 1, 2, 4, 8],
418
514
  }
419
515
 
420
516
 
@@ -422,7 +518,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
422
518
  """For overriding the number of eval instances at the run level."""
423
519
 
424
520
  name = "max_eval_instances"
425
- values_dict: Dict[str, List[Any]] = {}
521
+ values_dict: Dict[str, List[Any]] = {
522
+ "default": [1_000],
523
+ "heim_default": [100],
524
+ "heim_fid": [30_000],
525
+ "heim_art_styles": [17],
526
+ }
426
527
 
427
528
 
428
529
  class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -435,6 +536,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
435
536
  }
436
537
 
437
538
 
539
+ class NumTrialRunExpander(ReplaceValueRunExpander):
540
+ """For getting different generations for the same requests."""
541
+
542
+ name = "num_trials"
543
+ values_dict = {
544
+ "heim_efficiency": [5],
545
+ }
546
+
547
+
438
548
  class ModelRunExpander(ReplaceValueRunExpander):
439
549
  """
440
550
  For specifying different models.
@@ -476,6 +586,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
476
586
  "openai/text-davinci-003",
477
587
  ],
478
588
  "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
589
+ "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
479
590
  "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
480
591
  }
481
592
 
@@ -500,6 +611,33 @@ class ModelDeploymentRunExpander(ReplaceValueRunExpander):
500
611
  values_dict: Dict[str, List[Any]] = {}
501
612
 
502
613
 
614
+ class EvalSplitRunExpander(RunExpander):
615
+ """Sets the evaluation split.
616
+
617
+ By default, evaluation instances are drawn from both test and validation splits.
618
+ This run expander allows drawing evaluation instances from only the test split or
619
+ only the validation split."""
620
+
621
+ # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
622
+ # run expander name to be "eval_split", not "eval_splits".
623
+
624
+ name = "eval_split"
625
+
626
+ def __init__(self, value):
627
+ if value != TEST_SPLIT and value != VALID_SPLIT:
628
+ raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
629
+ self.split = value
630
+
631
+ def expand(self, run_spec: RunSpec) -> List[RunSpec]:
632
+ return [
633
+ replace(
634
+ run_spec,
635
+ name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
636
+ adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
637
+ )
638
+ ]
639
+
640
+
503
641
  ############################################################
504
642
 
505
643
 
@@ -688,6 +826,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
688
826
  )
689
827
 
690
828
 
829
+ def translate(language_code: str) -> PerturbationSpec:
830
+ return PerturbationSpec(
831
+ class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
832
+ args={"language_code": language_code},
833
+ )
834
+
835
+
836
+ def suffix(text: str) -> PerturbationSpec:
837
+ return PerturbationSpec(
838
+ class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
839
+ args={"suffix": text},
840
+ )
841
+
842
+
691
843
  # Specifies the data augmentations that we're interested in trying out.
692
844
  # Concretely, this is a mapping from the name (which is specified in a conf
693
845
  # file or the CLI) to a list of options to try, where each option is a list of perturbations.
@@ -879,6 +1031,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
879
1031
  mandarin_to_cantonese(),
880
1032
  ]
881
1033
  },
1034
+ # Multilinguality
1035
+ "chinese": {"chinese": [translate(language_code="zh-CN")]},
1036
+ "hindi": {"hindi": [translate(language_code="hi")]},
1037
+ "spanish": {"spanish": [translate(language_code="es")]},
1038
+ # Styles
1039
+ "art": {
1040
+ "art": [
1041
+ suffix("oil painting"),
1042
+ suffix("watercolor"),
1043
+ suffix("pencil sketch"),
1044
+ suffix("animation"),
1045
+ suffix("vector graphics"),
1046
+ suffix("pixel art"),
1047
+ ]
1048
+ },
882
1049
  }
883
1050
 
884
1051
 
@@ -1219,12 +1386,14 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1219
1386
  NewlineRunExpander,
1220
1387
  StopRunExpander,
1221
1388
  FormatPromptRunExpander,
1389
+ FollowFormatInstructionsRunExpander,
1222
1390
  AddToStopRunExpander,
1223
1391
  GlobalPrefixRunExpander,
1224
1392
  NumTrainTrialsRunExpander,
1225
1393
  MaxTrainInstancesRunExpander,
1226
1394
  MaxEvalInstancesRunExpander,
1227
1395
  NumOutputsRunExpander,
1396
+ NumTrialRunExpander,
1228
1397
  ModelRunExpander,
1229
1398
  ModelDeploymentRunExpander,
1230
1399
  DataAugmentationRunExpander,
@@ -1232,6 +1401,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
1232
1401
  NumPromptTokensRunExpander,
1233
1402
  NumOutputTokensRunExpander,
1234
1403
  ChatMLRunExpander,
1404
+ EvalSplitRunExpander,
1235
1405
  ]
1236
1406
 
1237
1407
 
@@ -0,0 +1,93 @@
1
+ from dataclasses import dataclass, field
2
+ import importlib
3
+ import os
4
+ import pkgutil
5
+ from typing import Callable, Dict, Iterable, List, Optional, TypeVar
6
+
7
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
+ from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
9
+ from helm.benchmark.metrics.metric import MetricSpec
10
+ from helm.benchmark.scenarios.scenario import ScenarioSpec
11
+ from helm.benchmark.annotation.annotator import AnnotatorSpec
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class RunSpec:
16
+ """
17
+ Specifies how to do a single run, which gets a scenario, adapts it, and
18
+ computes a list of stats based on the defined metrics.
19
+ """
20
+
21
+ name: str
22
+ """Unique identifier of the RunSpec"""
23
+
24
+ scenario_spec: ScenarioSpec
25
+ """Which scenario"""
26
+
27
+ adapter_spec: AdapterSpec
28
+ """Specifies how to adapt an instance into a set of requests"""
29
+
30
+ metric_specs: List[MetricSpec]
31
+ """What to evaluate on"""
32
+
33
+ data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
34
+ """Data augmenter. The default `DataAugmenterSpec` does nothing."""
35
+
36
+ groups: List[str] = field(default_factory=list)
37
+ """Groups that this run spec belongs to (for aggregation)"""
38
+
39
+ annotators: Optional[List[AnnotatorSpec]] = None
40
+ """Annotators to use for this run spec"""
41
+
42
+ def __post_init__(self):
43
+ """
44
+ `self.name` is used as the name of the output folder for the `RunSpec`.
45
+ Clean up `self.name` by replacing any "/"'s with "_".
46
+ """
47
+ # TODO: Don't mutate name! clean this up before passing it into the constructor here
48
+ object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
49
+
50
+
51
+ RunSpecFunction = Callable[..., RunSpec]
52
+
53
+
54
+ _REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
55
+ """Dict of run spec function names to run spec functions."""
56
+
57
+
58
+ F = TypeVar("F", bound=RunSpecFunction)
59
+
60
+
61
+ def run_spec_function(name: str) -> Callable[[F], F]:
62
+ """Register the run spec function under the given name."""
63
+
64
+ def wrap(func: F) -> F:
65
+ if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
66
+ raise ValueError(f"A run spec function with name {name} already exists")
67
+ _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
68
+ return func
69
+
70
+ return wrap
71
+
72
+
73
+ # Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
74
+ def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
75
+ # Specifying the second argument (prefix) to iter_modules makes the
76
+ # returned name an absolute name instead of a relative one. This allows
77
+ # import_module to work without having to do additional modification to
78
+ # the name.
79
+ return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
80
+
81
+
82
+ def discover_run_spec_functions() -> None:
83
+ """Discover and register all run spec functions under helm.benchmark.run_specs"""
84
+ import helm.benchmark.run_specs # noqa
85
+
86
+ for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
87
+ importlib.import_module(name)
88
+
89
+
90
+ def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
91
+ """Return the run spec function registered under the given name."""
92
+ discover_run_spec_functions()
93
+ return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)
@@ -0,0 +1,163 @@
1
+ import dataclasses
2
+ from typing import List
3
+
4
+ from helm.benchmark.adaptation.adapter_spec import (
5
+ ADAPT_GENERATION,
6
+ ADAPT_MULTIPLE_CHOICE_JOINT,
7
+ )
8
+ from helm.benchmark.model_deployment_registry import (
9
+ ModelDeployment,
10
+ get_default_model_deployment_for_model,
11
+ get_model_deployment,
12
+ )
13
+ from helm.benchmark.model_metadata_registry import (
14
+ ANTHROPIC_CLAUDE_1_MODEL_TAG,
15
+ ANTHROPIC_CLAUDE_2_MODEL_TAG,
16
+ ANTHROPIC_CLAUDE_3_MODEL_TAG,
17
+ BUGGY_TEMP_0_TAG,
18
+ CHATML_MODEL_TAG,
19
+ GOOGLE_GEMINI_PRO_VISION_V1_TAG,
20
+ IDEFICS_INSTRUCT_MODEL_TAG,
21
+ LLAVA_MODEL_TAG,
22
+ OPEN_FLAMINGO_MODEL_TAG,
23
+ NLG_PREFIX_TAG,
24
+ NO_NEWLINES_TAG,
25
+ VISION_LANGUAGE_MODEL_TAG,
26
+ IDEFICS_MODEL_TAG,
27
+ ModelMetadata,
28
+ get_model_metadata,
29
+ )
30
+ from helm.benchmark.run_expander import (
31
+ RUN_EXPANDERS,
32
+ AnthropicClaude2RunExpander,
33
+ AnthropicClaude3RunExpander,
34
+ ChatMLRunExpander,
35
+ GlobalPrefixRunExpander,
36
+ IDEFICSInstructRunExpander,
37
+ IncreaseTemperatureRunExpander,
38
+ IncreaseMaxTokensRunExpander,
39
+ LlavaRunExpander,
40
+ OpenFlamingoRunExpander,
41
+ StopRunExpander,
42
+ )
43
+ from helm.benchmark.run_spec import RunSpec, get_run_spec_function
44
+ from helm.common.general import singleton
45
+ from helm.common.object_spec import ObjectSpec
46
+
47
+
48
+ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
49
+ """
50
+ Takes a specification (name, args) and returns a list of `RunSpec`s.
51
+ """
52
+ # Note that we are abusing `spec` a bit because the name is not actually a class name.
53
+ name = spec.class_name
54
+ args = spec.args
55
+
56
+ run_spec_function = get_run_spec_function(name)
57
+ if run_spec_function is None:
58
+ raise ValueError(f"Unknown run spec name: {name}")
59
+
60
+ # Peel off the run expanders (e.g., model)
61
+ expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore
62
+ args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
63
+
64
+ run_specs: List[RunSpec] = [run_spec_function(**args)]
65
+
66
+ # Apply expanders
67
+ for expander in expanders:
68
+ run_specs = [
69
+ child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
70
+ ]
71
+
72
+ def alter_run_spec(run_spec: RunSpec) -> RunSpec:
73
+ if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
74
+ raise ValueError("At least one of model_deployment and model must be specified")
75
+ elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment:
76
+ # Infer model from model deployment
77
+ default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name
78
+ if not default_model_name:
79
+ default_model_name = run_spec.adapter_spec.model_deployment
80
+ run_spec = dataclasses.replace(
81
+ run_spec,
82
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name),
83
+ )
84
+ elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
85
+ # Infer model deployment from model
86
+ default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model)
87
+ if not default_model_deployment:
88
+ raise ValueError(
89
+ f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}"
90
+ )
91
+ run_spec = dataclasses.replace(
92
+ run_spec,
93
+ adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment),
94
+ )
95
+
96
+ # Both model and model_deployment should now be filled
97
+ assert run_spec.adapter_spec.model_deployment
98
+ assert run_spec.adapter_spec.model
99
+
100
+ model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model)
101
+ deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment)
102
+ if run_spec.adapter_spec.model != deployment.model_name:
103
+ raise ValueError(
104
+ f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'"
105
+ f"for model '{run_spec.adapter_spec.model}' but the model deployment is "
106
+ f"for a different model '{deployment.model_name}'"
107
+ )
108
+ # For models that strip newlines, when we're generating, we need to set
109
+ # the delimiter to be '###' so we stop properly.
110
+ if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
111
+ ADAPT_GENERATION,
112
+ ADAPT_MULTIPLE_CHOICE_JOINT,
113
+ ):
114
+ stop_expander = StopRunExpander(value="hash")
115
+ run_spec = singleton(stop_expander.expand(run_spec))
116
+
117
+ if NLG_PREFIX_TAG in model.tags:
118
+ global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
119
+ run_spec = singleton(global_prefix_expander.expand(run_spec))
120
+
121
+ if CHATML_MODEL_TAG in model.tags:
122
+ chatml_expander = ChatMLRunExpander()
123
+ run_spec = singleton(chatml_expander.expand(run_spec))
124
+
125
+ # Anthropic Claude 1 and 2 prompts
126
+ if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
127
+ run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
128
+
129
+ # Anthropic Claude 3
130
+ if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
131
+ run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
132
+
133
+ # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
134
+ if (
135
+ VISION_LANGUAGE_MODEL_TAG in model.tags
136
+ and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
137
+ and run_spec.adapter_spec.max_tokens == 1
138
+ ):
139
+ run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
140
+
141
+ # IDEFICS special handling
142
+ if IDEFICS_MODEL_TAG in model.tags:
143
+ if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
144
+ run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
145
+
146
+ # Llava
147
+ if LLAVA_MODEL_TAG in model.tags:
148
+ run_spec = singleton(LlavaRunExpander().expand(run_spec))
149
+
150
+ # OpenFlamingo
151
+ if OPEN_FLAMINGO_MODEL_TAG in model.tags:
152
+ run_spec = singleton(OpenFlamingoRunExpander().expand(run_spec))
153
+
154
+ # For multiple choice
155
+ if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
156
+ increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
157
+ run_spec = singleton(increase_temperature_expander.expand(run_spec))
158
+
159
+ return run_spec
160
+
161
+ run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
162
+
163
+ return run_specs
File without changes