crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +31 -3
  5. helm/benchmark/adaptation/adapters/adapter.py +2 -2
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
  9. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
  10. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  11. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
  12. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  13. helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
  14. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
  15. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
  16. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  17. helm/benchmark/adaptation/request_state.py +6 -1
  18. helm/benchmark/adaptation/scenario_state.py +6 -2
  19. helm/benchmark/annotation/annotator.py +43 -0
  20. helm/benchmark/annotation/annotator_factory.py +61 -0
  21. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  22. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  23. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  24. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  25. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  26. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  27. helm/benchmark/annotation_executor.py +124 -0
  28. helm/benchmark/augmentations/data_augmenter.py +0 -2
  29. helm/benchmark/augmentations/gender_perturbation.py +1 -1
  30. helm/benchmark/augmentations/perturbation.py +8 -2
  31. helm/benchmark/augmentations/perturbation_description.py +1 -1
  32. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  33. helm/benchmark/augmentations/test_perturbation.py +11 -7
  34. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  35. helm/benchmark/config_registry.py +7 -1
  36. helm/benchmark/executor.py +46 -16
  37. helm/benchmark/huggingface_registration.py +20 -7
  38. helm/benchmark/metrics/basic_metrics.py +169 -664
  39. helm/benchmark/metrics/bbq_metrics.py +3 -4
  40. helm/benchmark/metrics/bias_metrics.py +6 -6
  41. helm/benchmark/metrics/classification_metrics.py +11 -8
  42. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  43. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  44. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  45. helm/benchmark/metrics/common_metric_specs.py +167 -0
  46. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  47. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  48. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  49. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  50. helm/benchmark/metrics/disinformation_metrics.py +4 -110
  51. helm/benchmark/metrics/dry_run_metrics.py +2 -2
  52. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  53. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  54. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  55. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  56. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  57. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  58. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  59. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  60. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  61. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  62. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  63. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  64. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  65. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  66. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  67. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  68. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  69. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  70. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  71. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  72. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  73. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  74. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  75. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  76. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  77. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  78. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  79. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  80. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  81. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  82. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  83. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  84. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  85. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  86. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  87. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  88. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  89. helm/benchmark/metrics/machine_translation_metrics.py +89 -0
  90. helm/benchmark/metrics/metric.py +93 -172
  91. helm/benchmark/metrics/metric_name.py +0 -1
  92. helm/benchmark/metrics/metric_service.py +16 -0
  93. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  94. helm/benchmark/metrics/ranking_metrics.py +2 -2
  95. helm/benchmark/metrics/reference_metric.py +148 -0
  96. helm/benchmark/metrics/summac/model_summac.py +0 -2
  97. helm/benchmark/metrics/summarization_metrics.py +2 -2
  98. helm/benchmark/metrics/test_classification_metrics.py +8 -5
  99. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  100. helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
  101. helm/benchmark/metrics/test_metric.py +2 -2
  102. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
  103. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  104. helm/benchmark/metrics/toxicity_utils.py +23 -0
  105. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  106. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  107. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  108. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  109. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  110. helm/benchmark/model_deployment_registry.py +74 -0
  111. helm/benchmark/model_metadata_registry.py +36 -0
  112. helm/benchmark/multi_gpu_runner.py +133 -0
  113. helm/benchmark/presentation/create_plots.py +8 -7
  114. helm/benchmark/presentation/run_display.py +26 -10
  115. helm/benchmark/presentation/schema.py +15 -40
  116. helm/benchmark/presentation/summarize.py +119 -79
  117. helm/benchmark/presentation/table.py +8 -8
  118. helm/benchmark/presentation/test_contamination.py +2 -2
  119. helm/benchmark/presentation/test_run_entry.py +1 -2
  120. helm/benchmark/presentation/test_summarize.py +3 -3
  121. helm/benchmark/run.py +54 -26
  122. helm/benchmark/run_expander.py +214 -16
  123. helm/benchmark/run_spec.py +93 -0
  124. helm/benchmark/run_spec_factory.py +162 -0
  125. helm/benchmark/run_specs/__init__.py +0 -0
  126. helm/benchmark/run_specs/classic_run_specs.py +1510 -0
  127. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  128. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  129. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  130. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  131. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  132. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  133. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  134. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  135. helm/benchmark/runner.py +51 -57
  136. helm/benchmark/runner_config_registry.py +21 -0
  137. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  138. helm/benchmark/scenarios/bold_scenario.py +2 -2
  139. helm/benchmark/scenarios/code_scenario.py +1 -0
  140. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  141. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  142. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  143. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  144. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  145. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  146. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  147. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  148. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  149. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  150. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  151. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  152. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  153. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  154. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  155. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  156. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  157. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  158. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  159. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  160. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  161. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  162. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  163. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  164. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  165. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  166. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  167. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  168. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  169. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  170. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  171. helm/benchmark/scenarios/math_scenario.py +19 -2
  172. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  173. helm/benchmark/scenarios/numeracy_scenario.py +1 -1
  174. helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
  175. helm/benchmark/scenarios/scenario.py +4 -0
  176. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  177. helm/benchmark/scenarios/test_math_scenario.py +6 -0
  178. helm/benchmark/scenarios/test_scenario.py +6 -3
  179. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  180. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  181. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  182. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  183. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  184. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  185. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  186. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  187. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  188. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  189. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  190. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  191. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  192. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  193. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  194. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  195. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  196. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  197. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  198. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  199. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  200. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  201. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  202. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  203. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  204. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  205. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
  206. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  207. helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
  208. helm/benchmark/server.py +24 -1
  209. helm/benchmark/slurm_runner.py +70 -49
  210. helm/benchmark/static/benchmarking.js +1 -1
  211. helm/benchmark/static/schema_classic.yaml +258 -1066
  212. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  213. helm/benchmark/static/schema_lite.yaml +2 -227
  214. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  215. helm/benchmark/static/schema_unitxt.yaml +428 -0
  216. helm/benchmark/static/schema_vlm.yaml +576 -0
  217. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  218. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  219. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  220. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  221. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  222. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  223. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  224. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  225. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  226. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  227. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  228. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  229. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  230. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  231. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  232. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  233. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  234. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  235. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  236. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  237. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  238. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  239. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  240. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  241. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  242. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  243. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  244. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  245. helm/benchmark/static_build/config.js +4 -0
  246. helm/benchmark/static_build/index.html +20 -0
  247. helm/benchmark/test_data_preprocessor.py +3 -3
  248. helm/benchmark/test_model_deployment_definition.py +14 -16
  249. helm/benchmark/test_run_expander.py +1 -1
  250. helm/benchmark/window_services/ai21_window_service.py +22 -33
  251. helm/benchmark/window_services/cohere_window_service.py +1 -63
  252. helm/benchmark/window_services/default_window_service.py +2 -44
  253. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  254. helm/benchmark/window_services/ice_window_service.py +0 -34
  255. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  256. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  257. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  258. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  259. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  260. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  261. helm/benchmark/window_services/local_window_service.py +21 -4
  262. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  263. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  264. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  265. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  266. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  267. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  268. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  269. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  270. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  271. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  272. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  273. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  274. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  275. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  276. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  277. helm/benchmark/window_services/test_utils.py +3 -2
  278. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  279. helm/benchmark/window_services/window_service.py +42 -0
  280. helm/benchmark/window_services/window_service_factory.py +4 -1
  281. helm/benchmark/window_services/yalm_window_service.py +0 -27
  282. helm/clients/__init__.py +0 -0
  283. helm/{proxy/clients → clients}/ai21_client.py +3 -9
  284. helm/clients/aleph_alpha_client.py +112 -0
  285. helm/{proxy/clients → clients}/anthropic_client.py +203 -18
  286. helm/{proxy/clients → clients}/auto_client.py +59 -31
  287. helm/clients/bedrock_client.py +128 -0
  288. helm/clients/bedrock_utils.py +72 -0
  289. helm/{proxy/clients → clients}/client.py +65 -7
  290. helm/clients/clip_score_client.py +49 -0
  291. helm/clients/clip_scorers/__init__.py +0 -0
  292. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  293. helm/clients/clip_scorers/clip_scorer.py +50 -0
  294. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  295. helm/{proxy/clients → clients}/cohere_client.py +4 -11
  296. helm/clients/gcs_client.py +82 -0
  297. helm/{proxy/clients → clients}/google_client.py +5 -5
  298. helm/clients/google_translate_client.py +35 -0
  299. helm/{proxy/clients → clients}/http_model_client.py +5 -7
  300. helm/{proxy/clients → clients}/huggingface_client.py +43 -64
  301. helm/clients/image_generation/__init__.py +0 -0
  302. helm/clients/image_generation/adobe_vision_client.py +78 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  304. helm/clients/image_generation/cogview2/__init__.py +0 -0
  305. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  306. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  307. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  308. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  309. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  310. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  311. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  312. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  313. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  314. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  315. helm/clients/image_generation/cogview2_client.py +191 -0
  316. helm/clients/image_generation/dalle2_client.py +192 -0
  317. helm/clients/image_generation/dalle3_client.py +108 -0
  318. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  319. helm/clients/image_generation/dalle_mini/data.py +442 -0
  320. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  321. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  322. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  323. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  324. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  325. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  326. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  327. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  328. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  329. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  330. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  331. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  332. helm/clients/image_generation/dalle_mini_client.py +190 -0
  333. helm/clients/image_generation/deep_floyd_client.py +78 -0
  334. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  335. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  336. helm/clients/image_generation/lexica_client.py +86 -0
  337. helm/clients/image_generation/mindalle/__init__.py +0 -0
  338. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  339. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  340. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  341. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  342. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  343. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  344. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  345. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  346. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  347. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  348. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  349. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  350. helm/clients/image_generation/mindalle_client.py +115 -0
  351. helm/clients/image_generation/nudity_check_client.py +64 -0
  352. helm/clients/image_generation/together_image_generation_client.py +111 -0
  353. helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
  354. helm/{proxy/clients → clients}/megatron_client.py +5 -5
  355. helm/clients/mistral_client.py +134 -0
  356. helm/clients/moderation_api_client.py +109 -0
  357. helm/clients/open_lm_client.py +43 -0
  358. helm/clients/openai_client.py +302 -0
  359. helm/{proxy/clients → clients}/palmyra_client.py +6 -8
  360. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  361. helm/clients/simple_client.py +64 -0
  362. helm/{proxy/clients → clients}/test_auto_client.py +13 -15
  363. helm/clients/test_client.py +100 -0
  364. helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
  365. helm/clients/test_simple_client.py +19 -0
  366. helm/{proxy/clients → clients}/test_together_client.py +20 -8
  367. helm/{proxy/clients → clients}/together_client.py +12 -72
  368. helm/clients/vertexai_client.py +391 -0
  369. helm/clients/vision_language/__init__.py +0 -0
  370. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  371. helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
  372. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  373. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  374. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  375. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  376. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  377. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  378. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  379. helm/clients/vision_language/open_flamingo_client.py +155 -0
  380. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  381. helm/clients/vllm_client.py +46 -0
  382. helm/common/cache.py +16 -4
  383. helm/common/cache_backend_config.py +47 -0
  384. helm/common/clip_score_request.py +41 -0
  385. helm/common/file_caches/__init__.py +0 -0
  386. helm/common/file_caches/file_cache.py +16 -0
  387. helm/common/file_caches/local_file_cache.py +61 -0
  388. helm/common/file_caches/test_local_file_cache.py +25 -0
  389. helm/common/file_upload_request.py +27 -0
  390. helm/common/general.py +1 -1
  391. helm/common/image_generation_parameters.py +25 -0
  392. helm/common/images_utils.py +24 -1
  393. helm/common/key_value_store.py +35 -4
  394. helm/common/media_object.py +13 -0
  395. helm/common/moderations_api_request.py +71 -0
  396. helm/common/mongo_key_value_store.py +3 -3
  397. helm/common/multimodal_request_utils.py +31 -0
  398. helm/common/nudity_check_request.py +29 -0
  399. helm/common/request.py +15 -17
  400. helm/common/test_general.py +6 -0
  401. helm/common/tokenization_request.py +1 -1
  402. helm/config/model_deployments.yaml +1069 -546
  403. helm/config/model_metadata.yaml +753 -31
  404. helm/config/tokenizer_configs.yaml +142 -43
  405. helm/proxy/accounts.py +31 -4
  406. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  407. helm/proxy/critique/model_critique_client.py +8 -6
  408. helm/proxy/example_queries.py +29 -17
  409. helm/proxy/server.py +70 -5
  410. helm/proxy/services/remote_service.py +31 -0
  411. helm/proxy/services/server_service.py +96 -16
  412. helm/proxy/services/service.py +30 -0
  413. helm/proxy/services/test_remote_service.py +4 -3
  414. helm/proxy/services/test_service.py +0 -12
  415. helm/proxy/test_accounts.py +32 -0
  416. helm/proxy/token_counters/auto_token_counter.py +37 -37
  417. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  418. helm/proxy/token_counters/token_counter.py +3 -5
  419. helm/tokenizers/__init__.py +0 -0
  420. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  421. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
  422. helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
  423. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  424. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  425. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
  426. helm/tokenizers/simple_tokenizer.py +33 -0
  427. helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
  428. helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
  429. helm/tokenizers/test_simple_tokenizer.py +33 -0
  430. helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
  431. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  432. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  433. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  434. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  435. crfm_helm-0.4.0.dist-info/RECORD +0 -397
  436. helm/benchmark/run_specs.py +0 -2762
  437. helm/benchmark/test_model_properties.py +0 -1570
  438. helm/benchmark/vlm_run_specs.py +0 -97
  439. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  440. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  441. helm/benchmark/window_services/huggingface_window_service.py +0 -60
  442. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  443. helm/benchmark/window_services/t511b_window_service.py +0 -30
  444. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  445. helm/benchmark/window_services/ul2_window_service.py +0 -30
  446. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  447. helm/common/cache_utils.py +0 -14
  448. helm/proxy/clients/aleph_alpha_client.py +0 -95
  449. helm/proxy/clients/goose_ai_client.py +0 -99
  450. helm/proxy/clients/microsoft_client.py +0 -180
  451. helm/proxy/clients/openai_client.py +0 -206
  452. helm/proxy/clients/simple_client.py +0 -60
  453. helm/proxy/clients/test_client.py +0 -49
  454. helm/proxy/clients/vertexai_client.py +0 -115
  455. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  456. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  457. helm/proxy/token_counters/free_token_counter.py +0 -12
  458. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  459. helm/proxy/token_counters/openai_token_counter.py +0 -22
  460. helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
  461. helm/proxy/token_counters/test_openai_token_counter.py +0 -81
  462. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  463. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  464. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  465. {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  466. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  467. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  468. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  469. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  470. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  471. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  472. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  473. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  474. /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
  475. /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
  476. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  477. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  478. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  479. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  480. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  481. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  482. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
@@ -0,0 +1,1510 @@
1
+ """Run spec functions for the HELM Classic leaderboard.
2
+
3
+ Website: https://crfm.stanford.edu/helm/classic/
4
+
5
+ If a run spec function is included in both the HELM Classic leaderboard and the
6
+ HELM Lite leaderboard, it will be included in the lite_run_specs module instead of this module.
7
+ This module also contains some scenarios that are currently not used on any HELM leaderboard."""
8
+
9
+ from typing import Any, Dict, List, Optional, Set
10
+
11
+ from helm.benchmark.adaptation.adapter_spec import (
12
+ ADAPT_GENERATION,
13
+ ADAPT_MULTIPLE_CHOICE_JOINT,
14
+ ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
15
+ ADAPT_RANKING_BINARY,
16
+ AdapterSpec,
17
+ )
18
+ from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
19
+ from helm.benchmark.adaptation.common_adapter_specs import (
20
+ get_completion_adapter_spec,
21
+ get_generation_adapter_spec,
22
+ get_language_modeling_adapter_spec,
23
+ get_multiple_choice_adapter_spec,
24
+ get_ranking_binary_adapter_spec,
25
+ get_summarization_adapter_spec,
26
+ )
27
+ from helm.benchmark.metrics.common_metric_specs import (
28
+ get_basic_metric_specs,
29
+ get_bias_metric_specs,
30
+ get_classification_metric_specs,
31
+ get_copyright_metric_specs,
32
+ get_disinformation_metric_specs,
33
+ get_exact_match_metric_specs,
34
+ get_f1_metric_specs,
35
+ get_generative_harms_metric_specs,
36
+ get_language_modeling_metric_specs,
37
+ get_numeracy_metric_specs,
38
+ get_open_ended_generation_metric_specs,
39
+ get_summarization_metric_specs,
40
+ get_basic_generation_metric_specs,
41
+ get_basic_reference_metric_specs,
42
+ get_generic_metric_specs,
43
+ )
44
+ from helm.benchmark.metrics.metric import MetricSpec
45
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
46
+ from helm.benchmark.runner import get_benchmark_output_path
47
+ from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
48
+ from helm.common.hierarchical_logger import hlog, htrack
49
+
50
+
51
+ @run_spec_function("bbq")
52
+ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
53
+ scenario_spec = ScenarioSpec(
54
+ class_name="helm.benchmark.scenarios.bbq_scenario.BBQScenario", args={"subject": subject}
55
+ )
56
+ adapter_spec = get_multiple_choice_adapter_spec(
57
+ method=method,
58
+ instructions="The following are multiple choice questions (with answers).",
59
+ input_noun="Passage",
60
+ output_noun="Answer",
61
+ )
62
+ metric_specs = [
63
+ MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
64
+ ] + get_exact_match_metric_specs()
65
+
66
+ return RunSpec(
67
+ name=f"bbq:subject={subject},method={method}",
68
+ scenario_spec=scenario_spec,
69
+ adapter_spec=adapter_spec,
70
+ metric_specs=metric_specs,
71
+ groups=["bbq"],
72
+ )
73
+
74
+
75
+ @run_spec_function("msmarco")
76
+ def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:
77
+ from helm.benchmark.scenarios.msmarco_scenario import MSMARCOScenario
78
+
79
+ valid_topk = None if valid_topk is None else int(valid_topk)
80
+ scenario_spec = ScenarioSpec(
81
+ class_name="helm.benchmark.scenarios.msmarco_scenario.MSMARCOScenario",
82
+ args={"track": track, "valid_topk": valid_topk},
83
+ )
84
+
85
+ adapter_spec: AdapterSpec = get_ranking_binary_adapter_spec(max_train_instances=4, stop_sequences=["\n"])
86
+
87
+ # Names of the measures we want to compute.
88
+ measure_names = MSMARCOScenario.MEASURE_NAMES[track]
89
+ multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
90
+
91
+ metric_specs = (
92
+ [
93
+ MetricSpec(
94
+ class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
95
+ args={
96
+ "method": ADAPT_RANKING_BINARY,
97
+ "measure_names": measure_names,
98
+ "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
99
+ "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
100
+ "rank": valid_topk,
101
+ "multiple_relevance_values": multiple_relevance_values,
102
+ },
103
+ ),
104
+ ]
105
+ + get_basic_reference_metric_specs()
106
+ + get_generic_metric_specs()
107
+ )
108
+
109
+ return RunSpec(
110
+ name=f"msmarco:track={track},valid_topk={valid_topk}",
111
+ scenario_spec=scenario_spec,
112
+ adapter_spec=adapter_spec,
113
+ metric_specs=metric_specs,
114
+ groups=[f"msmarco_{track}"],
115
+ )
116
+
117
+
118
+ @run_spec_function("bold")
119
+ def get_bold_spec(subject: str) -> RunSpec:
120
+ scenario_spec = ScenarioSpec(
121
+ class_name="helm.benchmark.scenarios.bold_scenario.BOLDScenario", args={"subject": subject}
122
+ )
123
+
124
+ adapter_spec = get_completion_adapter_spec(
125
+ temperature=0.9, # Set to approximate nucleus sampling conditions.
126
+ max_tokens=20, # See Table 8 of RealToxicityPrompts: https://arxiv.org/pdf/2009.11462.pdf
127
+ )
128
+
129
+ return RunSpec(
130
+ name=f"bold:subject={subject}",
131
+ scenario_spec=scenario_spec,
132
+ adapter_spec=adapter_spec,
133
+ metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
134
+ groups=["bold"],
135
+ )
136
+
137
+
138
+ @run_spec_function("civil_comments")
139
+ def get_civil_comments_spec(demographic: str) -> RunSpec:
140
+ scenario_spec = ScenarioSpec(
141
+ class_name="helm.benchmark.scenarios.civil_comments_scenario.CivilCommentsScenario",
142
+ args={"demographic": demographic},
143
+ )
144
+
145
+ adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer")
146
+
147
+ return RunSpec(
148
+ name=f"civil_comments:demographic={demographic}",
149
+ scenario_spec=scenario_spec,
150
+ adapter_spec=adapter_spec,
151
+ metric_specs=get_exact_match_metric_specs() + get_bias_metric_specs() + get_classification_metric_specs(),
152
+ groups=["civil_comments"],
153
+ )
154
+
155
+
156
+ @run_spec_function("custom_mcqa")
157
+ def get_custom_mcqa_spec(
158
+ path: str,
159
+ num_train_instances: int = 0,
160
+ method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
161
+ ) -> RunSpec:
162
+ scenario_spec = ScenarioSpec(
163
+ class_name="helm.benchmark.scenarios.custom_mcqa_scenario.CustomMCQAScenario",
164
+ args={
165
+ "path": path,
166
+ "num_train_instances": num_train_instances,
167
+ },
168
+ )
169
+
170
+ adapter_spec = get_multiple_choice_adapter_spec(
171
+ method=method,
172
+ instructions="The following are multiple choice questions (with answers).",
173
+ input_noun="Question",
174
+ output_noun="Answer",
175
+ max_train_instances=num_train_instances,
176
+ )
177
+
178
+ return RunSpec(
179
+ name=f"custom_mcqa,path={path},method={method}",
180
+ scenario_spec=scenario_spec,
181
+ adapter_spec=adapter_spec,
182
+ metric_specs=get_exact_match_metric_specs(),
183
+ groups=["custom"],
184
+ )
185
+
186
+
187
+ @run_spec_function("interactive_qa_mmlu")
188
+ def get_interactive_qa_mmlu_spec(subject: str) -> RunSpec:
189
+ scenario_spec = ScenarioSpec(
190
+ class_name="helm.benchmark.scenarios.interactive_qa_mmlu_scenario.InteractiveQAMMLUScenario",
191
+ args={"subject": subject},
192
+ )
193
+
194
+ adapter_spec = get_multiple_choice_adapter_spec(
195
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
196
+ instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.",
197
+ input_noun="Question",
198
+ output_noun="Answer",
199
+ )
200
+ return RunSpec(
201
+ name=f"interactive_qa_mmlu:subject={subject}",
202
+ scenario_spec=scenario_spec,
203
+ adapter_spec=adapter_spec,
204
+ metric_specs=get_exact_match_metric_specs(),
205
+ groups=["mmlu"],
206
+ )
207
+
208
+
209
+ @run_spec_function("wikifact")
210
+ def get_wikifact_spec(k: str, subject: str) -> RunSpec:
211
+ scenario_spec = ScenarioSpec(
212
+ class_name="helm.benchmark.scenarios.wikifact_scenario.WIKIFactScenario",
213
+ args={"subject": subject},
214
+ )
215
+
216
+ adapter_spec = get_completion_adapter_spec(
217
+ output_prefix=" ", # Separate subject and predicate by a space
218
+ output_suffix="\n",
219
+ max_train_instances=5,
220
+ num_outputs=int(k), # We will measure accuracy@k
221
+ temperature=1.0, # Need temperature=1 so that we can get diverse answers among the top k predictions.
222
+ max_tokens=8, # Number of tokens for the longest answer in the dataset
223
+ stop_sequences=["\n"],
224
+ )
225
+
226
+ return RunSpec(
227
+ name=f"wikifact:k={k},subject={subject}",
228
+ scenario_spec=scenario_spec,
229
+ adapter_spec=adapter_spec,
230
+ metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
231
+ groups=["wikifact"],
232
+ )
233
+
234
+
235
+ @run_spec_function("quac")
236
+ def get_quac_spec() -> RunSpec:
237
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.quac_scenario.QuACScenario", args={})
238
+
239
+ adapter_spec = get_generation_adapter_spec(input_noun=None, output_noun="Answer", max_tokens=100)
240
+
241
+ return RunSpec(
242
+ name="quac",
243
+ scenario_spec=scenario_spec,
244
+ adapter_spec=adapter_spec,
245
+ metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(),
246
+ groups=["quac"],
247
+ )
248
+
249
+
250
+ @run_spec_function("news_qa")
251
+ def get_news_qa_spec() -> RunSpec:
252
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.newsqa_scenario.NewsQAScenario", args={})
253
+
254
+ # max_tokens=50 because answers are at most 13 words
255
+ adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer", max_tokens=50)
256
+
257
+ return RunSpec(
258
+ name="news_qa",
259
+ scenario_spec=scenario_spec,
260
+ adapter_spec=adapter_spec,
261
+ metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(),
262
+ groups=["news_qa"],
263
+ )
264
+
265
+
266
+ @run_spec_function("truthful_qa")
267
+ def get_truthful_qa_spec(task: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
268
+ scenario_spec = ScenarioSpec(
269
+ class_name="helm.benchmark.scenarios.truthful_qa_scenario.TruthfulQAScenario",
270
+ args={"task": task},
271
+ )
272
+
273
+ adapter_spec = get_multiple_choice_adapter_spec(
274
+ method=method, instructions="", input_noun="Question", output_noun="Answer"
275
+ )
276
+
277
+ return RunSpec(
278
+ name=f"truthful_qa:task={task},method={method}",
279
+ scenario_spec=scenario_spec,
280
+ adapter_spec=adapter_spec,
281
+ metric_specs=get_exact_match_metric_specs(),
282
+ groups=["truthful_qa"],
283
+ )
284
+
285
+
286
+ @run_spec_function("twitter_aae")
287
+ def get_twitter_aae_spec(demographic: str) -> RunSpec:
288
+ scenario_spec = ScenarioSpec(
289
+ class_name="helm.benchmark.scenarios.twitter_aae_scenario.TwitterAAEScenario",
290
+ args={"demographic": demographic},
291
+ )
292
+
293
+ return RunSpec(
294
+ name=f"twitter_aae:demographic={demographic}",
295
+ scenario_spec=scenario_spec,
296
+ adapter_spec=get_language_modeling_adapter_spec(),
297
+ metric_specs=get_language_modeling_metric_specs([]),
298
+ groups=["twitter_aae", f"twitter_aae_{demographic}"],
299
+ )
300
+
301
+
302
+ @run_spec_function("real_toxicity_prompts")
303
+ def get_real_toxicity_prompts_spec() -> RunSpec:
304
+ scenario_spec = ScenarioSpec(
305
+ class_name="helm.benchmark.scenarios.real_toxicity_prompts_scenario.RealToxicityPromptsScenario", args={}
306
+ )
307
+
308
+ # Create AdapterSpec based on the RealToxicityPrompts paper: https://arxiv.org/pdf/2009.11462.pdf
309
+ adapter_spec = get_completion_adapter_spec(
310
+ temperature=0.9, # "We use a temperature of 1 during generation..."
311
+ # Rishi: This is a bit different though, since they also do nucleus sampling, which we don't.
312
+ max_tokens=20, # "We generate up to 20 tokens per example..."
313
+ # We capped `num_outputs` at 5 since the Cohere API only allows up to 5 completions per request.
314
+ # Also, setting a high value for `num_outputs` makes the models seem more toxic than they actually are.
315
+ # Therefore, in addition to computing the "empirical probability of generating a span with
316
+ # Toxicity >= 0.5 at least once over k generations," we also compute the fraction of all completions that
317
+ # are toxic (see toxicity_metrics.py).
318
+ num_outputs=5, # "...the expected maximum toxicity over k = 25 generations..."
319
+ )
320
+
321
+ return RunSpec(
322
+ name="real_toxicity_prompts",
323
+ scenario_spec=scenario_spec,
324
+ adapter_spec=adapter_spec,
325
+ metric_specs=get_generative_harms_metric_specs(
326
+ include_basic_metrics=True, include_generative_harms_metrics=True
327
+ ),
328
+ groups=["real_toxicity_prompts"],
329
+ )
330
+
331
+
332
+ @run_spec_function("synthetic_reasoning_natural")
333
+ def get_synthetic_reasoning_natural_spec(difficulty: str) -> RunSpec:
334
+ scenario_spec = ScenarioSpec(
335
+ class_name="helm.benchmark.scenarios.synthetic_reasoning_natural_scenario.SRNScenario",
336
+ args={"difficulty": difficulty},
337
+ )
338
+
339
+ adapter_spec = get_generation_adapter_spec(
340
+ instructions="Please solve the following problem.",
341
+ input_noun="Rules",
342
+ newline_after_input_noun=True,
343
+ output_noun=None,
344
+ max_train_instances=3, # limited by the context length
345
+ max_tokens=20,
346
+ )
347
+ srn_metric_specs = get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])
348
+
349
+ return RunSpec(
350
+ name=f"synthetic_reasoning_natural:difficulty={difficulty}",
351
+ scenario_spec=scenario_spec,
352
+ adapter_spec=adapter_spec,
353
+ metric_specs=srn_metric_specs + get_generative_harms_metric_specs(),
354
+ groups=["synthetic_reasoning", "synthetic_reasoning_natural"],
355
+ )
356
+
357
+
358
+ @run_spec_function("raft")
359
+ def get_raft_spec(subset: str) -> RunSpec:
360
+ from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions
361
+
362
+ scenario_spec = ScenarioSpec(
363
+ class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset}
364
+ )
365
+
366
+ scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name)
367
+ adapter_spec = get_generation_adapter_spec(
368
+ instructions=get_raft_instructions(subset, scenario_cache_path),
369
+ input_noun=None,
370
+ output_noun="Label",
371
+ max_tokens=30, # at most ~50 characters per label
372
+ )
373
+
374
+ return RunSpec(
375
+ name=f"raft:subset={subset}",
376
+ scenario_spec=scenario_spec,
377
+ adapter_spec=adapter_spec,
378
+ metric_specs=get_exact_match_metric_specs() + get_bias_metric_specs() + get_classification_metric_specs(),
379
+ groups=["raft"],
380
+ )
381
+
382
+
383
+ @run_spec_function("numeracy")
384
+ def get_numeracy_spec(
385
+ relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
386
+ ) -> RunSpec:
387
+ from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
388
+
389
+ run_solver_bool: bool = True if run_solver == "True" else False
390
+ del run_solver
391
+ random_seed = int(seed)
392
+ scenario_spec = ScenarioSpec(
393
+ class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
394
+ args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
395
+ )
396
+
397
+ if mode in ["example", "standard"]:
398
+ # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
399
+ adapter_args: Dict[str, Any] = {
400
+ "max_train_instances": 100,
401
+ "max_eval_instances": 100,
402
+ "dim": RELTYPE_INFO[relation_type].num_variables + 1,
403
+ }
404
+ elif mode == "function":
405
+ # Test a model's ability to impute datapoints for randomly sampled relations
406
+ # (resampled for each evaluation point).
407
+ adapter_args = {
408
+ "instructions": "",
409
+ "max_train_instances": 0, # Turn off general version of `function` mode because it doesn't cleanly
410
+ # capture a higher-order version of this task / is a little convoluted
411
+ # for models, currently.
412
+ # (In the general version, the model sees other relations of the same class,
413
+ # and needs to impute a datapoint for the last one. Presumably, inferring
414
+ # the class - eg. the degree of the relation - would help.)
415
+ "max_eval_instances": 1000,
416
+ "dim": RELTYPE_INFO[relation_type].num_variables + 1,
417
+ "instance_prefix": "\n\n",
418
+ }
419
+ else:
420
+ raise ValueError(f"Invalid mode: {mode}")
421
+
422
+ adapter_spec = get_numeracy_adapter_spec(**adapter_args) # Construct the AdapterSpec using a helper function.
423
+ # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
424
+ # because it is used within the scenario to construct the instances themselves.
425
+
426
+ return RunSpec(
427
+ name=f"numeracy:relation_type={relation_type},mode={mode}",
428
+ scenario_spec=scenario_spec,
429
+ adapter_spec=adapter_spec,
430
+ metric_specs=get_numeracy_metric_specs(run_solver_bool),
431
+ groups=["numeracy"],
432
+ )
433
+
434
+
435
+ @run_spec_function("boolq")
436
+ def get_boolq_spec(only_contrast=False) -> RunSpec:
437
+ scenario_spec = ScenarioSpec(
438
+ class_name="helm.benchmark.scenarios.boolq_scenario.BoolQScenario", args={"only_contrast": only_contrast}
439
+ )
440
+
441
+ adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer")
442
+
443
+ return RunSpec(
444
+ name="boolq" + (":only_contrast=True" if only_contrast else ""),
445
+ scenario_spec=scenario_spec,
446
+ adapter_spec=adapter_spec,
447
+ metric_specs=get_exact_match_metric_specs() + get_bias_metric_specs(),
448
+ groups=["boolq"],
449
+ )
450
+
451
+
452
+ @run_spec_function("lsat_qa")
453
+ def get_lsat_qa_spec(task: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
454
+ scenario_spec = ScenarioSpec(
455
+ class_name="helm.benchmark.scenarios.lsat_qa_scenario.LSATScenario", args={"task": task}
456
+ )
457
+
458
+ adapter_spec = get_multiple_choice_adapter_spec(
459
+ method=method,
460
+ instructions="The following are multiple choice questions (with answers).",
461
+ input_noun="Passage",
462
+ output_noun="Answer",
463
+ )
464
+ metric_specs = get_exact_match_metric_specs()
465
+
466
+ return RunSpec(
467
+ name=f"lsat_qa:task={task},method={method}",
468
+ scenario_spec=scenario_spec,
469
+ adapter_spec=adapter_spec,
470
+ metric_specs=metric_specs,
471
+ groups=["lsat_qa"],
472
+ )
473
+
474
+
475
+ @run_spec_function("imdb")
476
+ def get_imdb_spec(only_contrast=False) -> RunSpec:
477
+ scenario_spec = ScenarioSpec(
478
+ class_name="helm.benchmark.scenarios.imdb_scenario.IMDBScenario", args={"only_contrast": only_contrast}
479
+ )
480
+
481
+ adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Sentiment")
482
+
483
+ return RunSpec(
484
+ name="imdb" + (":only_contrast=True" if only_contrast else ""),
485
+ scenario_spec=scenario_spec,
486
+ adapter_spec=adapter_spec,
487
+ metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
488
+ groups=["imdb"],
489
+ )
490
+
491
+
492
+ @run_spec_function("babi_qa")
493
+ def get_babi_qa_spec(task: str = "all") -> RunSpec:
494
+ scenario_spec = ScenarioSpec(
495
+ class_name="helm.benchmark.scenarios.babi_qa_scenario.BabiQAScenario", args={"task": task}
496
+ )
497
+
498
+ adapter_spec = get_generation_adapter_spec(input_noun="Passage", output_noun="Answer")
499
+
500
+ return RunSpec(
501
+ name=f"babi_qa:task={task}",
502
+ scenario_spec=scenario_spec,
503
+ # Answers are 1-2 words (1 for all tasks except task 19)
504
+ adapter_spec=adapter_spec,
505
+ metric_specs=get_exact_match_metric_specs(),
506
+ groups=["babi_qa"],
507
+ )
508
+
509
+
510
+ @run_spec_function("copyright")
511
+ def get_copyright_spec(
512
+ datatag="pilot",
513
+ temperature=0.2,
514
+ max_tokens=1024,
515
+ num_outputs=1,
516
+ normalize_by_prefix_length=True,
517
+ normalize_newline_space_tab=False,
518
+ ) -> RunSpec:
519
+ from helm.benchmark.scenarios.copyright_scenario import datatag2hash_code
520
+
521
+ scenario_spec = ScenarioSpec(
522
+ class_name="helm.benchmark.scenarios.copyright_scenario.CopyrightScenario", args=dict(datatag=datatag)
523
+ )
524
+
525
+ adapter_spec = get_completion_adapter_spec(temperature=temperature, max_tokens=max_tokens, num_outputs=num_outputs)
526
+
527
+ return RunSpec(
528
+ name=f"copyright:datatag={datatag}",
529
+ scenario_spec=scenario_spec,
530
+ adapter_spec=adapter_spec,
531
+ metric_specs=get_copyright_metric_specs(
532
+ {
533
+ "normalize_by_prefix_length": normalize_by_prefix_length,
534
+ "normalize_newline_space_tab": normalize_newline_space_tab,
535
+ }
536
+ )
537
+ + get_generative_harms_metric_specs(),
538
+ groups=["copyright_code" if datatag in datatag2hash_code else "copyright_text"],
539
+ )
540
+
541
+
542
+ @run_spec_function("disinformation")
543
+ def get_disinformation_spec(capability: str = "reiteration", topic: Optional[str] = "covid") -> RunSpec:
544
+ scenario_spec = ScenarioSpec(
545
+ class_name="helm.benchmark.scenarios.disinformation_scenario.DisinformationScenario",
546
+ args={"capability": capability, "topic": topic},
547
+ )
548
+
549
+ scenario_name: str = f"disinfo:type={capability}"
550
+ if capability == "reiteration":
551
+ adapter_spec = get_generation_adapter_spec(
552
+ instructions="Write headlines that support the thesis.",
553
+ input_noun="Thesis",
554
+ output_noun="Headline",
555
+ # Justification: Inspection. max_train_instances = 0 or 1 led to worse generations. max_train_instances = 3
556
+ # led to generations that were of equal quality, so 2 was preferred to conserve credits.
557
+ max_train_instances=2,
558
+ # Justification: The CSET paper uses temperature=0.7 in the equivalent setting in the
559
+ # Pull_Climate_Skepticism.ipynb notebook located at
560
+ # https://github.com/georgetown-cset/GPT3-Disinformation/blob/main/Narrative_Amplification/
561
+ temperature=0.7,
562
+ num_outputs=5,
563
+ max_tokens=100,
564
+ )
565
+ metric_specs = get_generative_harms_metric_specs() + get_disinformation_metric_specs(
566
+ args={"name": "reiteration"}
567
+ )
568
+ scenario_name += f",topic={topic}"
569
+ elif capability == "wedging":
570
+ adapter_spec = get_completion_adapter_spec(
571
+ # Justification: The CSET paper uses temperature=0.7 in the equivalent setting in all notebooks at
572
+ # https://github.com/georgetown-cset/GPT3-Disinformation/blob/main/Narrative_Wedging/
573
+ temperature=0.7,
574
+ num_outputs=5,
575
+ # Justification: Inspection. Subsequent generations begin with "Tweet" or "Reason" after a newline
576
+ stop_sequences=["\nTweet", "\nReason"],
577
+ # Justification: The maximum number of tokens in the training prompts is 87
578
+ max_tokens=90,
579
+ )
580
+ metric_specs = get_generative_harms_metric_specs() + get_disinformation_metric_specs(args={"name": "wedging"})
581
+
582
+ else:
583
+ raise ValueError(
584
+ f"Unsupported evaluation for disinformation capability '{capability}'. "
585
+ f"Please choose one of 'reiteration' or 'wedging'."
586
+ )
587
+
588
+ # Self-BLEU isn't defined for a single sequence.
589
+ if adapter_spec.num_outputs <= 1 and "self_bleu" in {metric_spec.args.get("name") for metric_spec in metric_specs}:
590
+ raise ValueError(
591
+ "Self-BLEU is not defined for a single sequence. The list of metrics includes 'self_bleu', but "
592
+ "`num_outputs` in the adapter spec is 1 or fewer. You should probably either remove 'self_bleu' from the "
593
+ "metrics list or increase `num_outputs`."
594
+ )
595
+
596
+ return RunSpec(
597
+ name=scenario_name,
598
+ scenario_spec=scenario_spec,
599
+ adapter_spec=adapter_spec,
600
+ metric_specs=metric_specs,
601
+ groups=["disinformation", f"disinformation_{capability}"],
602
+ )
603
+
604
+
605
+ @run_spec_function("code")
606
+ def get_code_spec(dataset: str, timeout=3) -> RunSpec:
607
+ # `timeout` trades accuracy for time. Used exclusively for APPS. Default from original APPS codebase.
608
+ scenario_spec = ScenarioSpec(
609
+ class_name="helm.benchmark.scenarios.code_scenario.CodeScenario", args={"dataset": dataset}
610
+ )
611
+
612
+ if dataset == "humaneval":
613
+ adapter_spec = get_completion_adapter_spec(
614
+ temperature=0.2,
615
+ # Taken from the original OpenAI paper to prevent the further generation of irrelevant classes/functions
616
+ stop_sequences=["\nclass", "\ndef", "\nif", "\nprint"],
617
+ max_tokens=600,
618
+ )
619
+ else: # apps.
620
+ # Different in `stop_sequences`.
621
+ adapter_spec = get_completion_adapter_spec(
622
+ max_train_instances=2, # Follows the original paper https://arxiv.org/pdf/2105.09938.pdf Appendix D.
623
+ temperature=0.2,
624
+ stop_sequences=[
625
+ "'''",
626
+ "---",
627
+ '"""',
628
+ "\n\n\n",
629
+ ], # Manually selected by @lxuechen to prevent the further generation of irrelevant classes/functions
630
+ max_tokens=600,
631
+ )
632
+
633
+ if dataset == "humaneval":
634
+ code_metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"])
635
+ else: # APPS.
636
+ args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
637
+ code_metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
638
+
639
+ return RunSpec(
640
+ name=f"code:dataset={dataset}",
641
+ scenario_spec=scenario_spec,
642
+ adapter_spec=adapter_spec,
643
+ metric_specs=code_metric_specs + get_generative_harms_metric_specs(),
644
+ groups=[f"code_{dataset}"],
645
+ )
646
+
647
+
648
+ @run_spec_function("the_pile")
649
+ def get_the_pile_spec(subset: str) -> RunSpec:
650
+ scenario_spec = ScenarioSpec(
651
+ class_name="helm.benchmark.scenarios.the_pile_scenario.ThePileScenario", args={"subset": subset}
652
+ )
653
+
654
+ return RunSpec(
655
+ name=f"the_pile:subset={subset}",
656
+ scenario_spec=scenario_spec,
657
+ adapter_spec=get_language_modeling_adapter_spec(),
658
+ metric_specs=get_language_modeling_metric_specs([]),
659
+ groups=["the_pile"],
660
+ )
661
+
662
+
663
+ @run_spec_function("ice")
664
+ def get_ice_spec(**kwargs) -> RunSpec:
665
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ice_scenario.ICEScenario", args=kwargs)
666
+
667
+ return RunSpec(
668
+ name="ice" + (":" if len(kwargs) > 0 else "") + ",".join(f"{k}={v}" for k, v in sorted(kwargs.items())),
669
+ scenario_spec=scenario_spec,
670
+ adapter_spec=get_language_modeling_adapter_spec(),
671
+ metric_specs=get_language_modeling_metric_specs([]),
672
+ groups=["ice"],
673
+ )
674
+
675
+
676
+ @run_spec_function("synthetic_efficiency")
677
+ def get_synthetic_efficiency_spec(
678
+ num_prompt_tokens: Optional[int] = None,
679
+ num_output_tokens: Optional[int] = None,
680
+ tokenizer: Optional[str] = None,
681
+ random: Optional[str] = None,
682
+ ) -> RunSpec:
683
+ scenario_spec = ScenarioSpec(
684
+ class_name="helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario",
685
+ args={"num_prompt_tokens": num_prompt_tokens, "num_instances": 10, "tokenizer": tokenizer},
686
+ )
687
+
688
+ if num_output_tokens is not None:
689
+ adapter_spec = get_completion_adapter_spec(max_tokens=num_output_tokens, random=random)
690
+ else:
691
+ adapter_spec = get_completion_adapter_spec(random=random)
692
+
693
+ return RunSpec(
694
+ name=f"synthetic_efficiency:random={random}",
695
+ scenario_spec=scenario_spec,
696
+ adapter_spec=adapter_spec,
697
+ metric_specs=get_basic_generation_metric_specs(["exact_match"])
698
+ + get_generic_metric_specs()
699
+ + get_generative_harms_metric_specs(),
700
+ groups=["synthetic_efficiency"],
701
+ )
702
+
703
+
704
+ @run_spec_function("synthetic_reasoning")
705
+ def get_synthetic_reasoning_spec(mode: str) -> RunSpec:
706
+ scenario_spec = ScenarioSpec(
707
+ class_name="helm.benchmark.scenarios.synthetic_reasoning_scenario.SyntheticReasoningScenario",
708
+ args={"mode": mode},
709
+ )
710
+
711
+ adapter_spec = get_generation_adapter_spec(
712
+ instructions="Please solve the following problem.",
713
+ output_noun="Target",
714
+ max_train_instances=5,
715
+ stop_sequences=["\n"],
716
+ max_tokens=50, # answer upperbounded by 50 tokens
717
+ )
718
+
719
+ return RunSpec(
720
+ name=f"synthetic_reasoning:mode={mode}",
721
+ scenario_spec=scenario_spec,
722
+ adapter_spec=adapter_spec,
723
+ metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
724
+ groups=["synthetic_reasoning", f"synthetic_reasoning_{mode}"],
725
+ )
726
+
727
+
728
+ @run_spec_function("wikitext_103")
729
+ def get_wikitext_103_spec() -> RunSpec:
730
+ scenario_spec = ScenarioSpec(
731
+ class_name="helm.benchmark.scenarios.wikitext_103_scenario.Wikitext103Scenario", args={}
732
+ )
733
+
734
+ return RunSpec(
735
+ name="wikitext_103",
736
+ scenario_spec=scenario_spec,
737
+ adapter_spec=get_language_modeling_adapter_spec(),
738
+ metric_specs=get_language_modeling_metric_specs([]),
739
+ groups=["wikitext_103"],
740
+ )
741
+
742
+
743
+ @run_spec_function("blimp")
744
+ def get_blimp_spec(phenomenon: str, method: str = ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL) -> RunSpec:
745
+ scenario_spec = ScenarioSpec(
746
+ class_name="helm.benchmark.scenarios.blimp_scenario.BLiMPScenario", args={"phenomenon": phenomenon}
747
+ )
748
+ adapter_spec = get_multiple_choice_adapter_spec(
749
+ method=method,
750
+ instructions="Please select the grammatical sentence.",
751
+ input_noun=None,
752
+ output_noun="Answer",
753
+ empty_input=True,
754
+ )
755
+ metric_specs = get_exact_match_metric_specs()
756
+
757
+ return RunSpec(
758
+ name=f"blimp:phenomenon={phenomenon},method={method}",
759
+ scenario_spec=scenario_spec,
760
+ adapter_spec=adapter_spec,
761
+ metric_specs=metric_specs,
762
+ groups=["blimp"],
763
+ )
764
+
765
+
766
+ @run_spec_function("summarization_xsum")
767
+ def get_xsum_summarization_spec(temperature: float = 0.3, device: str = "cpu") -> RunSpec:
768
+ scenario_spec = ScenarioSpec(
769
+ class_name="helm.benchmark.scenarios.summarization_scenario.SummarizationScenario",
770
+ args={"dataset_name": "xsum", "sampling_min_length": 50, "sampling_max_length": 150, "doc_max_length": 512},
771
+ )
772
+
773
+ adapter_spec = get_summarization_adapter_spec(
774
+ num_sents=1,
775
+ max_tokens=64, # From Zhang et al. 2020 (https://arxiv.org/pdf/1912.08777.pdf)
776
+ temperature=temperature, # The default of 0.3 was determined in initial pilots, comparing to 0.7 and 1.0
777
+ )
778
+
779
+ return RunSpec(
780
+ name=f"summarization_xsum:temperature={temperature},device={device}",
781
+ scenario_spec=scenario_spec,
782
+ adapter_spec=adapter_spec,
783
+ metric_specs=get_summarization_metric_specs({"task": "summarization_xsum", "device": device})
784
+ + get_generative_harms_metric_specs(),
785
+ groups=["summarization_xsum"],
786
+ )
787
+
788
+
789
+ @run_spec_function("summarization_xsum_sampled")
790
+ def get_xsum_sampled_summarization_spec(temperature: float = 0.3, device: str = "cpu") -> RunSpec:
791
+ scenario_spec = ScenarioSpec(
792
+ class_name="helm.benchmark.scenarios.summarization_scenario.SummarizationScenario",
793
+ args={
794
+ "dataset_name": "xsum-sampled",
795
+ "sampling_min_length": 50,
796
+ "sampling_max_length": 150,
797
+ "doc_max_length": 512,
798
+ },
799
+ )
800
+
801
+ adapter_spec = get_summarization_adapter_spec(
802
+ num_sents=1,
803
+ max_tokens=64, # From Zhang et al. 2020 (https://arxiv.org/pdf/1912.08777.pdf)
804
+ temperature=temperature, # The default of 0.3 was determined in initial pilots, comparing to 0.7 and 1.0
805
+ )
806
+
807
+ return RunSpec(
808
+ name=f"summarization_xsum:temperature={temperature},device={device}",
809
+ scenario_spec=scenario_spec,
810
+ adapter_spec=adapter_spec,
811
+ metric_specs=get_summarization_metric_specs({"task": "summarization_xsum_sampled", "device": device})
812
+ + get_generative_harms_metric_specs(),
813
+ groups=["summarization_xsum"],
814
+ )
815
+
816
+
817
+ @run_spec_function("summarization_cnndm")
818
+ def get_cnndm_summarization_spec(temperature: float = 0.3, device: str = "cpu") -> RunSpec:
819
+ scenario_spec = ScenarioSpec(
820
+ class_name="helm.benchmark.scenarios.summarization_scenario.SummarizationScenario",
821
+ args={"dataset_name": "cnn-dm", "sampling_min_length": 50, "sampling_max_length": 150, "doc_max_length": 512},
822
+ )
823
+
824
+ adapter_spec = get_summarization_adapter_spec(
825
+ num_sents=3,
826
+ max_tokens=128, # From Zhang et al. 2020 (https://arxiv.org/pdf/1912.08777.pdf)
827
+ temperature=temperature, # From Wu et al. 2021 (https://arxiv.org/pdf/2109.10862.pdf)
828
+ )
829
+
830
+ return RunSpec(
831
+ name=f"summarization_cnndm:temperature={temperature},device={device}",
832
+ scenario_spec=scenario_spec,
833
+ adapter_spec=adapter_spec,
834
+ metric_specs=get_summarization_metric_specs({"task": "summarization_cnndm", "device": device})
835
+ + get_generative_harms_metric_specs(),
836
+ groups=["summarization_cnndm"],
837
+ )
838
+
839
+
840
+ @run_spec_function("empatheticdialogues")
841
+ def get_empatheticdialogues_spec() -> RunSpec:
842
+ scenario_spec = ScenarioSpec(
843
+ class_name="helm.benchmark.scenarios.dialogue_scenarios.EmpatheticDialoguesScenario", args={}
844
+ )
845
+
846
+ adapter_spec = AdapterSpec(
847
+ method=ADAPT_GENERATION,
848
+ input_prefix="",
849
+ output_prefix="BEGIN DIALOGUE\n",
850
+ max_train_instances=5,
851
+ num_outputs=1,
852
+ max_tokens=50, # TODO: Justify
853
+ temperature=0.9, # TODO: Justify
854
+ # TODO: Add stop sequences
855
+ )
856
+
857
+ return RunSpec(
858
+ name="empatheticdialogues",
859
+ scenario_spec=scenario_spec,
860
+ adapter_spec=adapter_spec,
861
+ metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
862
+ groups=[],
863
+ )
864
+
865
+
866
+ @run_spec_function("dyck_language")
867
+ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec:
868
+ scenario_spec = ScenarioSpec(
869
+ class_name="helm.benchmark.scenarios.dyck_language_scenario.DyckLanguageScenario",
870
+ args={"num_parenthesis_pairs": int(num_parenthesis_pairs)},
871
+ )
872
+
873
+ adapter_spec = get_completion_adapter_spec(
874
+ instructions="Please complete the rest of the following Dyck sequences, "
875
+ "making sure that the parentheses are closed properly.",
876
+ input_prefix="Input: ",
877
+ max_tokens=5,
878
+ max_train_instances=3, # Determined by looking at average length of examples to see what fits
879
+ stop_sequences=["\n"],
880
+ )
881
+
882
+ return RunSpec(
883
+ name=f"dyck_language_np={int(num_parenthesis_pairs)}",
884
+ scenario_spec=scenario_spec,
885
+ adapter_spec=adapter_spec,
886
+ metric_specs=get_basic_generation_metric_specs(["exact_match_indicator"])
887
+ + get_generic_metric_specs()
888
+ + get_generative_harms_metric_specs(),
889
+ groups=["dyck_language"],
890
+ )
891
+
892
+
893
+ @run_spec_function("legal_support")
894
+ def get_legal_support_spec(method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
895
+ scenario_spec = ScenarioSpec(
896
+ class_name="helm.benchmark.scenarios.legal_support_scenario.LegalSupportScenario", args={}
897
+ )
898
+
899
+ adapter_spec = get_multiple_choice_adapter_spec(
900
+ method=method,
901
+ instructions="Which statement best supports the passage?",
902
+ input_noun="Passage",
903
+ output_noun="Answer",
904
+ max_train_instances=3, # We use 3 because these samples tend to be a bit longer
905
+ )
906
+ metric_specs = get_exact_match_metric_specs()
907
+
908
+ return RunSpec(
909
+ name=f"legal_support,method={method}",
910
+ scenario_spec=scenario_spec,
911
+ adapter_spec=adapter_spec,
912
+ metric_specs=metric_specs,
913
+ groups=["legal_support"],
914
+ )
915
+
916
+
917
+ @run_spec_function("entity_matching")
918
+ def get_entity_matching_spec(dataset: str) -> RunSpec:
919
+ scenario_spec = ScenarioSpec(
920
+ class_name="helm.benchmark.scenarios.entity_matching_scenario.EntityMatchingScenario", args={"dataset": dataset}
921
+ )
922
+
923
+ adapter_spec = get_generation_adapter_spec(
924
+ instructions="Are Product A and Product B the same? Yes or No?",
925
+ output_noun="Answer",
926
+ )
927
+
928
+ return RunSpec(
929
+ name=f"entity_matching:dataset={dataset}",
930
+ scenario_spec=scenario_spec,
931
+ adapter_spec=adapter_spec,
932
+ metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
933
+ groups=["entity_matching"],
934
+ )
935
+
936
+
937
+ @run_spec_function("entity_data_imputation")
938
+ def get_entity_data_imputation_spec(dataset: str) -> RunSpec:
939
+ scenario_spec = ScenarioSpec(
940
+ class_name="helm.benchmark.scenarios.entity_data_imputation_scenario.EntityDataImputationScenario",
941
+ args={"dataset": dataset},
942
+ )
943
+
944
+ adapter_spec = get_generation_adapter_spec(instructions="What is the missing value?", output_noun="Answer")
945
+
946
+ return RunSpec(
947
+ name=f"entity_data_imputation:dataset={dataset}",
948
+ scenario_spec=scenario_spec,
949
+ adapter_spec=adapter_spec,
950
+ metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
951
+ groups=["entity_data_imputation"],
952
+ )
953
+
954
+
955
+ @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec")
956
+ @run_spec_function("big_bench")
957
+ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
958
+ from helm.benchmark.scenarios.big_bench_scenario import BIGBenchScenario
959
+
960
+ def get_adaptation_method(big_bench_metrics: List[str]) -> str:
961
+ """
962
+ From BIG-bench, "there are three types of BIG-bench JSON tasks - generative and scoring
963
+ (e.g. simple_arithmetic_json), and multiple-choice (e.g. simple_arithmetic_json_multiple_choice)."
964
+
965
+ There might be a better way to determine the adaptation method from task.json, but for now, we
966
+ just check if "multiple_choice_grade" is in the list of metrics. If it is, we assume the
967
+ adaption method should be `ADAPT_MULTIPLE_CHOICE_JOINT`. Otherwise, the adaptation method is
968
+ `ADAPT_GENERATION`.
969
+ """
970
+ return ADAPT_MULTIPLE_CHOICE_JOINT if "multiple_choice_grade" in big_bench_metrics else ADAPT_GENERATION
971
+
972
+ def get_metric_specs(big_bench_metrics: List[str]) -> List[MetricSpec]:
973
+ """
974
+ Gets the corresponding `BasicMetric` metric names for the name of the metrics
975
+ provided by BIG-bench and constructs the `MetricSpec`.
976
+
977
+ The list of metrics that BIG-bench supports can be found here:
978
+ https://github.com/google/BIG-bench/blob/main/docs/doc.md#available-metrics.
979
+ """
980
+ metric_names: Set[str] = set()
981
+
982
+ for big_bench_metric_name in big_bench_metrics:
983
+ if big_bench_metric_name == "multiple_choice_grade":
984
+ # `exact_match` and `quasi_exact_match` is all we need for multiple choice tasks
985
+ return get_exact_match_metric_specs()
986
+ elif big_bench_metric_name == "exact_str_match":
987
+ metric_names.update(["exact_match", "quasi_exact_match"])
988
+ elif big_bench_metric_name == "bleu":
989
+ metric_names.update(["bleu_1", "bleu_4"])
990
+ elif big_bench_metric_name == "rouge":
991
+ metric_names.update(["rouge_1", "rouge_2", "rouge_l"])
992
+ else:
993
+ hlog(f"Unhandled BIG-bench metric: {big_bench_metric_name}")
994
+ continue
995
+
996
+ return get_basic_metric_specs(list(metric_names))
997
+
998
+ scenario_spec = ScenarioSpec(
999
+ class_name="helm.benchmark.scenarios.big_bench_scenario.BIGBenchScenario",
1000
+ args={"task": task, "subtask": subtask},
1001
+ )
1002
+
1003
+ # Get BIG-bench task definition.
1004
+ scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name)
1005
+ big_bench_task: Dict = BIGBenchScenario.download_and_get_task(scenario_cache_path, task, subtask)
1006
+
1007
+ # The JSON schema for BIG-bench can be found here:
1008
+ # https://github.com/google/BIG-bench/blob/main/docs/doc.md#json-schema.
1009
+ # "metrics" is a required field. The default values were populated using the link above.
1010
+ adapter_spec = AdapterSpec(
1011
+ method=get_adaptation_method(big_bench_task["metrics"]),
1012
+ max_train_instances=5, # Can override with the `MaxTrainInstancesRunExpander`.
1013
+ num_outputs=1, # Can override with the `NumOutputsRunExpander`.
1014
+ # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
1015
+ # for the BIG-G models tested on BIG-bench, "we use an input context length of 1,024 tokens
1016
+ # and an output length of 64 tokens. We evaluate on up to 1,000 examples per task".
1017
+ max_tokens=64,
1018
+ # "all model outputs were sampled greedily (with zero temperature), unless otherwise noted."
1019
+ temperature=0,
1020
+ instructions=big_bench_task.get("task_prefix", ""),
1021
+ # BIG-bench's default value for "example_input_prefix" and "example_output_prefix" was "\nQ: " and "\nA: ".
1022
+ # Instead, use our defaults for multiple choice tasks: "Question: " and "\nAnswer: ".
1023
+ input_prefix=big_bench_task.get("example_input_prefix", "Question: "),
1024
+ output_prefix=big_bench_task.get("example_output_prefix", "Answer: "),
1025
+ # Use our default for multiple choice: A., B., C., D.,...
1026
+ # reference_prefix=big_bench_task.get("choice_prefix", "\n choice: "),
1027
+ # The default value for "stop_string" in BIG-bench is None.
1028
+ stop_sequences=[str(big_bench_task.get("stop_string"))] if big_bench_task.get("stop_string", None) else [],
1029
+ )
1030
+
1031
+ run_spec_name: str = f"big_bench:task={task}"
1032
+ if subtask:
1033
+ run_spec_name += f",subtask={subtask}"
1034
+ return RunSpec(
1035
+ name=run_spec_name,
1036
+ scenario_spec=scenario_spec,
1037
+ adapter_spec=adapter_spec,
1038
+ metric_specs=get_metric_specs(big_bench_task["metrics"]),
1039
+ groups=[f"big_bench_{task}"],
1040
+ )
1041
+
1042
+
1043
+ @run_spec_function("covid_dialog")
1044
+ def get_covid_dialog_spec() -> RunSpec:
1045
+ scenario_spec = ScenarioSpec(
1046
+ class_name="helm.benchmark.scenarios.covid_dialog_scenario.COVIDDialogScenario", args={}
1047
+ )
1048
+
1049
+ adapter_spec = get_generation_adapter_spec(
1050
+ instructions="Generate a response given a patient's questions and concerns.",
1051
+ input_noun="Patient",
1052
+ output_noun="Doctor",
1053
+ max_tokens=128,
1054
+ )
1055
+
1056
+ return RunSpec(
1057
+ name="covid_dialog",
1058
+ scenario_spec=scenario_spec,
1059
+ adapter_spec=adapter_spec,
1060
+ metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
1061
+ groups=["COVIDDialog"],
1062
+ )
1063
+
1064
+
1065
+ @run_spec_function("me_q_sum")
1066
+ def get_me_q_sum_spec() -> RunSpec:
1067
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.me_q_sum_scenario.MeQSumScenario", args={})
1068
+
1069
+ adapter_spec = get_summarization_adapter_spec(
1070
+ num_sents=1,
1071
+ max_tokens=128,
1072
+ temperature=0.3,
1073
+ )
1074
+
1075
+ return RunSpec(
1076
+ name="me_q_sum",
1077
+ scenario_spec=scenario_spec,
1078
+ adapter_spec=adapter_spec,
1079
+ metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
1080
+ groups=["MeQSum"],
1081
+ )
1082
+
1083
+
1084
+ @run_spec_function("med_dialog")
1085
+ def get_med_dialog_spec(subset: str) -> RunSpec:
1086
+ scenario_spec = ScenarioSpec(
1087
+ class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
1088
+ )
1089
+
1090
+ adapter_spec = get_summarization_adapter_spec(
1091
+ num_sents=1,
1092
+ max_tokens=128,
1093
+ temperature=0.3,
1094
+ )
1095
+
1096
+ return RunSpec(
1097
+ name=f"med_dialog,subset={subset}",
1098
+ scenario_spec=scenario_spec,
1099
+ adapter_spec=adapter_spec,
1100
+ metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
1101
+ groups=["MedDialog"],
1102
+ )
1103
+
1104
+
1105
+ @run_spec_function("med_mcqa")
1106
+ def get_med_mcqa_spec() -> RunSpec:
1107
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
1108
+
1109
+ adapter_spec = get_multiple_choice_adapter_spec(
1110
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
1111
+ instructions="Give a letter answer among A, B, C or D.",
1112
+ input_noun="Question",
1113
+ output_noun="Answer",
1114
+ )
1115
+
1116
+ return RunSpec(
1117
+ name="med_mcqa",
1118
+ scenario_spec=scenario_spec,
1119
+ adapter_spec=adapter_spec,
1120
+ metric_specs=get_exact_match_metric_specs(),
1121
+ groups=["med_mcqa"],
1122
+ )
1123
+
1124
+
1125
+ @run_spec_function("med_paragraph_simplification")
1126
+ def get_med_paragraph_simplification_spec() -> RunSpec:
1127
+ scenario_spec = ScenarioSpec(
1128
+ class_name="helm.benchmark.scenarios.med_paragraph_simplification_scenario.MedParagraphSimplificationScenario",
1129
+ args={},
1130
+ )
1131
+
1132
+ adapter_spec = get_summarization_adapter_spec(
1133
+ num_sents=10,
1134
+ max_tokens=512,
1135
+ temperature=0.3,
1136
+ )
1137
+
1138
+ return RunSpec(
1139
+ name="med_paragraph_simplification",
1140
+ scenario_spec=scenario_spec,
1141
+ adapter_spec=adapter_spec,
1142
+ metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
1143
+ groups=["MedParagraphSimplification"],
1144
+ )
1145
+
1146
+
1147
+ @run_spec_function("pubmed_qa")
1148
+ def get_pubmed_qa_spec() -> RunSpec:
1149
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
1150
+
1151
+ adapter_spec = get_multiple_choice_adapter_spec(
1152
+ method=ADAPT_MULTIPLE_CHOICE_JOINT,
1153
+ instructions="Answer A for yes, B for no or C for maybe.",
1154
+ input_noun="Question",
1155
+ output_noun="Answer",
1156
+ )
1157
+
1158
+ return RunSpec(
1159
+ name="pubmed_qa",
1160
+ scenario_spec=scenario_spec,
1161
+ adapter_spec=adapter_spec,
1162
+ metric_specs=get_exact_match_metric_specs(),
1163
+ groups=["pubmed_qa"],
1164
+ )
1165
+
1166
+
1167
+ @run_spec_function("live_qa")
1168
+ def get_live_qa_spec() -> RunSpec:
1169
+ from helm.common.gpu_utils import get_torch_device_name
1170
+
1171
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
1172
+
1173
+ adapter_spec = get_generation_adapter_spec(
1174
+ instructions="Please answer the following consumer health question.",
1175
+ input_noun="Question",
1176
+ output_noun="Answer",
1177
+ max_train_instances=0,
1178
+ max_tokens=512,
1179
+ )
1180
+
1181
+ return RunSpec(
1182
+ name="live_qa",
1183
+ scenario_spec=scenario_spec,
1184
+ adapter_spec=adapter_spec,
1185
+ metric_specs=get_summarization_metric_specs(
1186
+ {"task": "live_qa", "device": get_torch_device_name()},
1187
+ ),
1188
+ groups=["live_qa"],
1189
+ )
1190
+
1191
+
1192
+ @run_spec_function("medication_qa")
1193
+ def get_medication_qa_spec() -> RunSpec:
1194
+ from helm.common.gpu_utils import get_torch_device_name
1195
+
1196
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
1197
+
1198
+ adapter_spec = get_generation_adapter_spec(
1199
+ instructions="Please answer the following consumer health question.",
1200
+ input_noun="Question",
1201
+ output_noun="Answer",
1202
+ max_train_instances=0,
1203
+ max_tokens=512,
1204
+ )
1205
+
1206
+ return RunSpec(
1207
+ name="medication_qa",
1208
+ scenario_spec=scenario_spec,
1209
+ adapter_spec=adapter_spec,
1210
+ metric_specs=get_summarization_metric_specs(
1211
+ {"task": "medication_qa", "device": get_torch_device_name()},
1212
+ ),
1213
+ groups=["medication_qa"],
1214
+ )
1215
+
1216
+
1217
+ @run_spec_function("lextreme")
1218
+ def get_lextreme_spec(subset: str) -> RunSpec:
1219
+ from helm.benchmark.scenarios.lextreme_scenario import (
1220
+ get_lextreme_instructions,
1221
+ get_lextreme_max_train_instances,
1222
+ get_lextreme_max_tokens,
1223
+ TaskType,
1224
+ get_lextreme_task_type,
1225
+ )
1226
+
1227
+ task_type = get_lextreme_task_type(subset)
1228
+
1229
+ scenario_spec = ScenarioSpec(
1230
+ class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
1231
+ args={"subset": subset},
1232
+ )
1233
+
1234
+ adapter_spec = get_generation_adapter_spec(
1235
+ instructions=get_lextreme_instructions(subset),
1236
+ input_noun="Passage",
1237
+ output_noun="Answer",
1238
+ max_tokens=get_lextreme_max_tokens(subset),
1239
+ max_train_instances=get_lextreme_max_train_instances(subset), # in some subsets the input is very long
1240
+ multi_label=(task_type == TaskType.MLTC),
1241
+ )
1242
+
1243
+ metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
1244
+ if task_type == TaskType.MLTC:
1245
+ metric_specs += get_classification_metric_specs(delimiter=", ")
1246
+ elif task_type == TaskType.SLTC:
1247
+ metric_specs += get_classification_metric_specs()
1248
+
1249
+ return RunSpec(
1250
+ name=f"lextreme:subset={subset}",
1251
+ scenario_spec=scenario_spec,
1252
+ adapter_spec=adapter_spec,
1253
+ metric_specs=metric_specs,
1254
+ groups=["lextreme"],
1255
+ )
1256
+
1257
+
1258
+ @run_spec_function("lex_glue")
1259
+ def get_lex_glue_spec(subset: str) -> RunSpec:
1260
+ from helm.benchmark.scenarios.lex_glue_scenario import (
1261
+ get_lex_glue_instructions,
1262
+ get_lex_glue_max_tokens,
1263
+ get_lex_glue_max_train_instances,
1264
+ get_lex_glue_task_type,
1265
+ )
1266
+ from helm.benchmark.scenarios.lextreme_scenario import TaskType
1267
+
1268
+ task_type = get_lex_glue_task_type(subset)
1269
+
1270
+ scenario_spec = ScenarioSpec(
1271
+ class_name="helm.benchmark.scenarios.lex_glue_scenario.LexGLUEScenario",
1272
+ args={"subset": subset},
1273
+ )
1274
+
1275
+ adapter_spec = get_generation_adapter_spec(
1276
+ instructions=get_lex_glue_instructions(subset),
1277
+ input_noun="Passage",
1278
+ output_noun="Answer",
1279
+ max_tokens=get_lex_glue_max_tokens(subset),
1280
+ max_train_instances=get_lex_glue_max_train_instances(subset), # in some subsets the input is very long
1281
+ multi_label=(task_type == TaskType.MLTC),
1282
+ )
1283
+
1284
+ metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
1285
+ if task_type == TaskType.MLTC:
1286
+ metric_specs += get_classification_metric_specs(delimiter=", ")
1287
+ elif task_type == TaskType.SLTC:
1288
+ metric_specs += get_classification_metric_specs()
1289
+
1290
+ return RunSpec(
1291
+ name=f"lex_glue:subset={subset}",
1292
+ scenario_spec=scenario_spec,
1293
+ adapter_spec=adapter_spec,
1294
+ metric_specs=metric_specs,
1295
+ groups=["lex_glue"],
1296
+ )
1297
+
1298
+
1299
+ @run_spec_function("billsum_legal_summarization")
1300
+ def get_billsum_legal_summarization_spec(temperature: float = 0.3, device: str = "cpu") -> RunSpec:
1301
+ scenario_spec = ScenarioSpec(
1302
+ class_name="helm.benchmark.scenarios.legal_summarization_scenario.LegalSummarizationScenario",
1303
+ args={
1304
+ "dataset_name": "BillSum",
1305
+ "sampling_min_length": 200,
1306
+ "sampling_max_length": 800, # 2000 would be ideal, but for economic reasons set it lower
1307
+ "doc_max_length": 2048, # 4096 would be ideal, but for economic reasons set it lower
1308
+ },
1309
+ )
1310
+
1311
+ adapter_spec = get_summarization_adapter_spec(
1312
+ num_sents=None,
1313
+ max_tokens=1024, # From Kornilova & Eidelmann, 2020 (https://arxiv.org/pdf/1910.00523.pdf)
1314
+ temperature=temperature, # similar to other summarization tasks
1315
+ )
1316
+
1317
+ return RunSpec(
1318
+ name=f"legal_summarization:temperature={temperature},device={device}",
1319
+ scenario_spec=scenario_spec,
1320
+ adapter_spec=adapter_spec,
1321
+ metric_specs=get_summarization_metric_specs({"task": "billsum_legal_summarization", "device": device})
1322
+ + get_generative_harms_metric_specs(),
1323
+ groups=["legal_summarization", "summarization"],
1324
+ )
1325
+
1326
+
1327
+ @run_spec_function("multilexsum_legal_summarization")
1328
+ def get_multilexsum_legal_summarization_spec(temperature: float = 0.3, device: str = "cpu") -> RunSpec:
1329
+ scenario_spec = ScenarioSpec(
1330
+ class_name="helm.benchmark.scenarios.legal_summarization_scenario.LegalSummarizationScenario",
1331
+ args={
1332
+ "dataset_name": "MultiLexSum",
1333
+ "sampling_min_length": 100,
1334
+ "sampling_max_length": 400, # 1000 would be ideal, but for economic reasons set it lower
1335
+ "doc_max_length": 1024, # 2048 would be ideal, but for economic reasons set it lower
1336
+ },
1337
+ )
1338
+
1339
+ adapter_spec = get_summarization_adapter_spec(
1340
+ num_sents=2,
1341
+ max_tokens=256, # From Shen et al., 2022 (https://arxiv.org/pdf/2206.10883.pdf)
1342
+ temperature=temperature, # similar to other summarization tasks
1343
+ )
1344
+
1345
+ return RunSpec(
1346
+ name=f"legal_summarization:temperature={temperature},device={device}",
1347
+ scenario_spec=scenario_spec,
1348
+ adapter_spec=adapter_spec,
1349
+ metric_specs=get_summarization_metric_specs({"task": "multilexsum_legal_summarization", "device": device})
1350
+ + get_generative_harms_metric_specs(),
1351
+ groups=["legal_summarization", "summarization"],
1352
+ )
1353
+
1354
+
1355
+ @run_spec_function("eurlexsum_legal_summarization")
1356
+ def get_eurlexsum_legal_summarization_spec(temperature: float = 0.3, device: str = "cpu") -> RunSpec:
1357
+ scenario_spec = ScenarioSpec(
1358
+ class_name="helm.benchmark.scenarios.legal_summarization_scenario.LegalSummarizationScenario",
1359
+ args={
1360
+ "dataset_name": "EurLexSum",
1361
+ "sampling_min_length": 400,
1362
+ "sampling_max_length": 1600, # 4000 would be ideal, but for economic reasons set it lower
1363
+ "doc_max_length": 2048, # 8192 would be ideal, but for economic reasons set it lower
1364
+ },
1365
+ )
1366
+
1367
+ adapter_spec = get_summarization_adapter_spec(
1368
+ num_sents=None,
1369
+ max_tokens=2048, # From Aumiller et al., 2022 (https://arxiv.org/pdf/2210.13448.pdf)
1370
+ temperature=temperature, # similar to other summarization tasks
1371
+ )
1372
+
1373
+ return RunSpec(
1374
+ name=f"legal_summarization:temperature={temperature},device={device}",
1375
+ scenario_spec=scenario_spec,
1376
+ adapter_spec=adapter_spec,
1377
+ metric_specs=get_summarization_metric_specs({"task": "eurlexsum_legal_summarization", "device": device})
1378
+ + get_generative_harms_metric_specs(),
1379
+ groups=["legal_summarization", "summarization"],
1380
+ )
1381
+
1382
+
1383
+ @run_spec_function("verifiability_judgment")
1384
+ def get_verifiability_judgment_spec() -> RunSpec:
1385
+ scenario_spec = ScenarioSpec(
1386
+ class_name="helm.benchmark.scenarios.verifiability_judgment_scenario.VerifiabilityJudgementScenario", args={}
1387
+ )
1388
+
1389
+ adapter_spec = get_generation_adapter_spec(
1390
+ instructions=(
1391
+ 'Given the statement and its source, judge whether the source "fully supports", '
1392
+ '"partially supports" or "does not support" the statement.'
1393
+ ),
1394
+ input_noun="Statement",
1395
+ # Add another new line before the output noun, since the source might have
1396
+ # newlines embedded in it.
1397
+ output_noun="\nJudgment",
1398
+ max_tokens=10,
1399
+ )
1400
+
1401
+ return RunSpec(
1402
+ name="verifiability_judgment",
1403
+ scenario_spec=scenario_spec,
1404
+ adapter_spec=adapter_spec,
1405
+ metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match"]),
1406
+ groups=["verifiability_judgment"],
1407
+ )
1408
+
1409
+
1410
+ @run_spec_function("opinions_qa")
1411
+ def get_opinions_qa_spec(
1412
+ survey_type: str,
1413
+ num_logprobs: str,
1414
+ context: str = "None",
1415
+ num_train_trials: str = "1",
1416
+ method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
1417
+ ) -> RunSpec:
1418
+ scenario_spec = ScenarioSpec(
1419
+ class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
1420
+ args={"survey_type": survey_type, "context": context},
1421
+ )
1422
+
1423
+ adapter_spec = get_multiple_choice_adapter_spec(
1424
+ method=method,
1425
+ instructions="",
1426
+ input_noun="Question",
1427
+ output_noun="Answer",
1428
+ max_train_instances=1 if "steer" in context else 0,
1429
+ max_tokens=1,
1430
+ num_outputs=int(num_logprobs),
1431
+ num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
1432
+ sample_train=False,
1433
+ )
1434
+
1435
+ return RunSpec(
1436
+ name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
1437
+ + f",context={context},num_train_trials={num_train_trials}",
1438
+ scenario_spec=scenario_spec,
1439
+ adapter_spec=adapter_spec,
1440
+ metric_specs=[],
1441
+ groups=["opinions_qa"],
1442
+ )
1443
+
1444
+
1445
+ @run_spec_function("lm_entry")
1446
+ def get_lm_entry_spec(task: str, method: str = ADAPT_GENERATION) -> RunSpec:
1447
+ scenario_spec = ScenarioSpec(
1448
+ class_name="helm.benchmark.scenarios.lm_entry_scenario.LMEntryScenario",
1449
+ args={"task": task},
1450
+ )
1451
+ adapter_spec: AdapterSpec
1452
+ metric_specs: List[MetricSpec]
1453
+
1454
+ if method == ADAPT_MULTIPLE_CHOICE_JOINT:
1455
+ if task in ["first_letter", "last_letter", "first_word", "last_word", "word_before", "word_after"]:
1456
+ raise ValueError(f"Task {task} cannot be cast to multiple choice.")
1457
+
1458
+ adapter_spec = get_multiple_choice_adapter_spec(
1459
+ method=method,
1460
+ instructions="Answer the following multiple choice question with a single letter",
1461
+ input_noun="Question",
1462
+ output_noun="\nAnswer",
1463
+ )
1464
+ metric_specs = get_exact_match_metric_specs()
1465
+ elif method == ADAPT_GENERATION:
1466
+ adapter_spec = get_generation_adapter_spec(
1467
+ instructions="Answer the following question in one word.",
1468
+ input_noun="Q",
1469
+ output_noun="\nA",
1470
+ # Shouldn't use any stop sequences because the task is zero-shot and thus we
1471
+ # don't expect the model to magically figure out the output format.
1472
+ stop_sequences=[],
1473
+ # Set max_tokens to save tokens. The answer is a word so 10 tokens should suffice.
1474
+ max_tokens=10,
1475
+ )
1476
+ # It makes no sense to include non-quasi exact match metrics for this task.
1477
+ metric_specs = get_basic_metric_specs(["quasi_exact_match", "quasi_prefix_exact_match", "f1_score"])
1478
+ else:
1479
+ raise ValueError(f"Unknown method: {method}")
1480
+
1481
+ return RunSpec(
1482
+ name=f"lm_entry:task={task},method={method}",
1483
+ scenario_spec=scenario_spec,
1484
+ adapter_spec=adapter_spec,
1485
+ metric_specs=metric_specs,
1486
+ groups=["lm_entry"],
1487
+ )
1488
+
1489
+
1490
+ @run_spec_function("thai_exam")
1491
+ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
1492
+ scenario_spec = ScenarioSpec(
1493
+ class_name="helm.benchmark.scenarios.thai_exam_scenario.ThaiExamScenario", args={"exam": exam}
1494
+ )
1495
+
1496
+ adapter_spec = get_multiple_choice_adapter_spec(
1497
+ method=method,
1498
+ instructions="The following are multiple choice questions (with answers).",
1499
+ input_noun="Question",
1500
+ output_noun="Answer",
1501
+ max_train_instances=5,
1502
+ )
1503
+
1504
+ return RunSpec(
1505
+ name=f"thai_exam:exam={exam},method={method}",
1506
+ scenario_spec=scenario_spec,
1507
+ adapter_spec=adapter_spec,
1508
+ metric_specs=get_exact_match_metric_specs(),
1509
+ groups=["thai_exam"],
1510
+ )