crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py}
@@ -1,798 +1,51 @@
- import importlib
- import itertools
- from functools import partial
- from typing import Any, Callable, List, Dict, Optional, Set, TypeVar
+ """Run spec functions for the HELM Classic leaderboard.

- from helm.common.hierarchical_logger import hlog, htrack
- from helm.common.object_spec import ObjectSpec
- from helm.benchmark.adaptation.adapters.adapter_factory import (
-     ADAPT_LANGUAGE_MODELING,
+ Website: https://crfm.stanford.edu/helm/classic/
+
+ If a run spec function is included in both the HELM Classic leaderboard and the
+ HELM Lite leaderboard, it will be included in the lite_run_specs module instead of this module.
+ This module also contains some scenarios that are currently not used on any HELM leaderboard."""
+
+ from typing import Any, Dict, List, Optional, Set
+
+ from helm.benchmark.adaptation.adapter_spec import (
+     ADAPT_GENERATION,
      ADAPT_MULTIPLE_CHOICE_JOINT,
      ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
-     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
-     ADAPT_GENERATION,
      ADAPT_RANKING_BINARY,
+     AdapterSpec,
  )
  from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.common.optional_dependencies import handle_module_not_found_error
- from .metrics.metric import MetricSpec
- from .run_expander import (
-     RUN_EXPANDERS,
-     RunExpander,
-     GlobalPrefixRunExpander,
-     StopRunExpander,
-     ChatMLRunExpander,
-     AddToStopRunExpander,
-     IncreaseMaxTokensRunExpander,
-     FormatPromptRunExpander,
-     IncreaseTemperatureRunExpander,
- )
- from .runner import RunSpec
- from .scenarios.lex_glue_scenario import (
-     get_lex_glue_max_train_instances,
-     get_lex_glue_instructions,
-     get_lex_glue_max_tokens,
-     get_lex_glue_task_type,
+ from helm.benchmark.adaptation.common_adapter_specs import (
+     get_completion_adapter_spec,
+     get_generation_adapter_spec,
+     get_language_modeling_adapter_spec,
+     get_multiple_choice_adapter_spec,
+     get_ranking_binary_adapter_spec,
+     get_summarization_adapter_spec,
  )
- from .scenarios.scenario import ScenarioSpec
- from .scenarios.big_bench_scenario import BIGBenchScenario
- from .scenarios.msmarco_scenario import MSMARCOScenario
- from .scenarios.copyright_scenario import datatag2hash_code
- from .scenarios.raft_scenario import get_raft_instructions
- from .scenarios.lextreme_scenario import (
-     get_lextreme_instructions,
-     get_lextreme_max_train_instances,
-     get_lextreme_max_tokens,
-     TaskType,
-     get_lextreme_task_type,
+ from helm.benchmark.metrics.common_metric_specs import (
+     get_basic_metric_specs,
+     get_bias_metric_specs,
+     get_classification_metric_specs,
+     get_copyright_metric_specs,
+     get_disinformation_metric_specs,
+     get_exact_match_metric_specs,
+     get_f1_metric_specs,
+     get_generative_harms_metric_specs,
+     get_language_modeling_metric_specs,
+     get_numeracy_metric_specs,
+     get_open_ended_generation_metric_specs,
+     get_summarization_metric_specs,
+     get_basic_generation_metric_specs,
+     get_basic_reference_metric_specs,
+     get_generic_metric_specs,
  )
- from helm.proxy.models import (
-     ANTHROPIC_CLAUDE_1_MODEL_TAG,
-     ANTHROPIC_CLAUDE_2_MODEL_TAG,
-     get_model,
-     NO_NEWLINES_TAG,
-     NLG_PREFIX_TAG,
-     CHATML_MODEL_TAG,
-     OPENAI_CHATGPT_MODEL_TAG,
-     BUGGY_TEMP_0_TAG,
- )
- from helm.common.general import singleton
-
-
- ############################################################
- # Prototypical adapter specs
-
-
- def format_instructions(instructions: str) -> str:
-     if len(instructions) > 0:
-         instructions += "\n"
-     return instructions
-
-
- def get_multiple_choice_joint_adapter_spec(
-     instructions: str,
-     input_noun: Optional[str],
-     output_noun: str,
-     num_outputs: int = 5,
-     max_train_instances: int = 5,
-     max_tokens: int = 5,
-     sample_train: bool = True,
-     **kwargs,
- ) -> AdapterSpec:
-     """
-     [instructions]
-
-     [input_noun]: [input]
-     [reference_1]
-     ...
-     [reference_k]
-     [output_noun]: [output]
-
-     [input_noun]: [input]
-     [reference_1]
-     ...
-     [reference_k]
-     [output_noun]:
-     """
-
-     return AdapterSpec(
-         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-         instructions=format_instructions(instructions),
-         input_prefix=f"{input_noun}: " if input_noun is not None else "",
-         input_suffix="\n" if input_noun is not None else "",
-         output_prefix=f"{output_noun}: ",
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         temperature=0.0,
-         stop_sequences=["\n"],
-         sample_train=sample_train,
-         **kwargs,
-     )
-
-
- def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
-     """
-     [input] [reference_i]
-     or
-     [reference_i]
-     """
-     assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
-
-     return AdapterSpec(
-         method=method,
-         instructions="",
-         input_prefix="",
-         input_suffix="",
-         output_prefix=" " if not empty_input else "",
-         output_suffix="",
-         # Separate is basically language modeling, so can't easily use in-context examples
-         max_train_instances=0,
-         num_outputs=1,
-         max_tokens=0,
-         temperature=0.0,
-     )
-
-
- def get_multiple_choice_adapter_spec(
-     method: str,
-     instructions: str,
-     input_noun: Optional[str],
-     output_noun: str,
-     max_train_instances: int = 5,
-     num_outputs: int = 5,
-     max_tokens: int = 1,
-     empty_input: bool = False,
-     sample_train: bool = True,
-     **kwargs,
- ):
-     """
-     Toggle between joint and separate adapters.
-     """
-     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
-         return get_multiple_choice_joint_adapter_spec(
-             instructions,
-             input_noun,
-             output_noun,
-             max_train_instances=max_train_instances,
-             num_outputs=num_outputs,
-             max_tokens=max_tokens,
-             sample_train=sample_train,
-             **kwargs,
-         )
-     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
-         return get_multiple_choice_separate_adapter_spec(method, empty_input)
-     else:
-         raise ValueError(f"Invalid adaptation method: {method}")
-
-
- def get_ranking_binary_adapter_spec(
-     instructions: str = "",
-     document_noun: str = "Passage",
-     query_noun: str = "Query",
-     output_prefix: str = "Does the passage answer the query?",
-     output_noun: str = "Answer",
-     max_train_instances: int = 4,
-     num_outputs: int = 1,
-     num_train_trials: int = 1,
-     temperature: float = 0.0,
-     max_tokens: int = 5,
-     **kwargs,
- ) -> AdapterSpec:
-     """
-     [instructions]
-
-     [object_noun]: [object]
-     [query_noun]: [query]
-     [prompt_noun]: [prompt_content]
-     [output_noun]: [output]
-
-     ...
-
-     [object_noun]: [object]
-     [query_noun]: [query]
-     [prompt_noun]: [prompt_content]
-     [output_noun]: [output]
-
-     [object_noun]: [object]
-     [query_noun]: [query]
-     [prompt_noun]: [prompt_content]
-     [output_noun]: [output]
-     """
-     msg = (
-         "There must be an even number of in-context examples to ensure that"
-         "an equal number of positive and negative examples are included."
-     )
-     assert max_train_instances % 2 == 0, msg
-     max_train_instances = int(max_train_instances / 2)
-
-     return AdapterSpec(
-         method=ADAPT_RANKING_BINARY,
-         instructions=format_instructions(instructions),
-         input_prefix=f"{query_noun}: ",
-         input_suffix="\n",
-         reference_prefix=f"{document_noun}: ",
-         reference_suffix="\n",
-         output_prefix=f"{output_prefix}\n{output_noun}: ",
-         max_train_instances=max_train_instances,
-         num_outputs=num_outputs,
-         num_train_trials=num_train_trials,
-         temperature=temperature,
-         max_tokens=max_tokens,
-         **kwargs,
-     )
-
-
- def get_completion_adapter_spec(
-     instructions: str = "",
-     input_prefix: str = "",
-     output_prefix: str = "",
-     output_suffix: str = "",
-     max_train_instances: int = 0,
-     temperature: float = 0.0,
-     num_outputs: int = 1,
-     max_tokens: int = 100,
-     stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is no stop sequence,
-     **kwargs,
- ) -> AdapterSpec:
-     """
-     [input][output_prefix][output][output_suffix]
-
-     [input][output_prefix]
-     """
-     if stop_sequences is None:
-         stop_sequences = []
-
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions=format_instructions(instructions),
-         input_prefix=input_prefix,
-         input_suffix="",
-         output_prefix=output_prefix,
-         output_suffix=output_suffix,
-         max_train_instances=max_train_instances,
-         temperature=temperature,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         stop_sequences=stop_sequences,
-         **kwargs,
-     )
-
-
- def get_generation_adapter_spec(
-     instructions: str = "",
-     input_noun: Optional[str] = None,
-     newline_after_input_noun: bool = False,
-     output_noun: Optional[str] = None,
-     newline_after_output_noun: bool = False,
-     max_train_instances: int = 5,
-     num_outputs: int = 1,
-     max_tokens: int = 5,
-     stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
-     temperature: float = 0.0,
-     multi_label: bool = False,
- ) -> AdapterSpec:
-     """
-     [instructions]
-
-     [input_noun]: [input]
-     [output_noun]: [output]
-
-     [input_noun]: [input]
-     [output_noun]:
-     """
-
-     def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
-         """
-         When `append_new_line` is False:
-             [input_noun]: [input]
-
-         When `append_new_line` is True:
-             [input_noun]:
-             [input]
-         """
-         prefix: str = f"{noun}:" if noun is not None else ""
-         if len(prefix) > 0:
-             prefix += "\n" if append_new_line else " "
-         return prefix
-
-     if stop_sequences is None:
-         stop_sequences = ["\n"]
-
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions=format_instructions(instructions),
-         input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun),
-         input_suffix="\n",
-         output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun),
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         temperature=temperature,
-         stop_sequences=stop_sequences,
-         multi_label=multi_label,
-     )
-
-
- def get_instruct_adapter_spec(
-     num_outputs: int = 1,
-     max_tokens: int = 512,
-     temperature: float = 0.7,
- ) -> AdapterSpec:
-     """
-     Zero-shot instruction-following.
-     """
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions="",
-         input_prefix="",
-         input_suffix="\n",
-         output_prefix="",
-         output_suffix="",
-         max_train_instances=0,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         temperature=temperature,
-         stop_sequences=[],
-     )
-
-
- def get_language_modeling_adapter_spec() -> AdapterSpec:
-     """
-     Used for language modeling.
-     """
-     return AdapterSpec(
-         method=ADAPT_LANGUAGE_MODELING,
-         instructions="",
-         input_prefix="",
-         input_suffix="",
-         output_prefix="",
-         output_suffix="",
-         max_train_instances=0,
-         num_outputs=1,
-         max_tokens=0,
-         temperature=0.0,
-     )
-
-
- def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
-     """
-     Used for summarization.
-     """
-
-     if num_sents == 1:
-         out_pref = "Summarize the above article in 1 sentence.\n"
-     elif num_sents is None:
-         out_pref = "Summarize the above article.\n"
-     else:
-         out_pref = f"Summarize the above article in {num_sents} sentences.\n"
-
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions="",
-         input_prefix="###\nArticle: ",
-         input_suffix="\n\n",
-         output_prefix=out_pref,
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=1,
-         stop_sequences=["###"],  # Separator between few-shot instances.
-         **kwargs,
-     )
-
-
- def get_machine_translation_adapter_spec(
-     source_language, target_language, max_train_instances, **kwargs
- ) -> AdapterSpec:
-     """
-     Used for machine translation.
-     """
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions=f"Translate {source_language} to {target_language}:",
-         input_prefix="",
-         input_suffix=" = ",
-         output_prefix="",
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=1,
-         stop_sequences=["\n\n"],
-         temperature=0.0,
-         **kwargs,
-     )
-
-
- ############################################################
- # Examples of scenario and adapter specs
-
-
- def get_scenario_spec1() -> ScenarioSpec:
-     return ScenarioSpec(
-         class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
-         args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 10, "num_test_instances": 10},
-     )
-
-
- def get_scenario_spec_tiny():
-     return ScenarioSpec(
-         class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
-         args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2},
-     )
-
-
- def get_adapter_spec1() -> AdapterSpec:
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions="Please solve the following problem.\n",
-         max_train_instances=5,
-         max_eval_instances=10,
-         num_outputs=3,
-         num_train_trials=3,
-         model="simple/model1",
-         temperature=1,
-         stop_sequences=["."],
-     )
-
-
- ############################################################
- # Metrics
-
-
- def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]:
-     return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})]
-
-
- def get_exact_match_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(
-         ["exact_match", "quasi_exact_match", "prefix_exact_match", "quasi_prefix_exact_match"]
-     )
-
-
- def get_f1_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])
-
-
- def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-             args={"delimiter": delimiter},
-         )
-     ]
-
-
- def get_multiple_choice_classification_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.classification_metrics.MultipleChoiceClassificationMetric", args={}
-         )
-     ]
-
-
- def get_bbq_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
-     ] + get_exact_match_metric_specs()
-
-
- def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]:
-     # Names of the measures we want to compute.
-     measure_names = MSMARCOScenario.MEASURE_NAMES[track]
-     multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
-
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
-             args={
-                 "method": ADAPT_RANKING_BINARY,
-                 "measure_names": measure_names,
-                 "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
-                 "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
-                 "rank": rank,
-                 "multiple_relevance_values": multiple_relevance_values,
-             },
-         ),
-     ] + get_basic_metric_specs(names=[])
-
-
- def get_toxicity_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}),
-     ]
-
-
- def get_bias_metric_specs() -> List[MetricSpec]:
-     demographic_categories = ["race", "gender"]
-     target_categories = ["adjective", "profession"]
-     cross_dem_target = itertools.product(demographic_categories, target_categories)
-
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
-             args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
-         )
-         for dem, tgt in cross_dem_target
-     ] + [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
-             args={"mode": "representation", "demographic_category": dem},
-         )
-         for dem in demographic_categories
-     ]
-
-
- def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
-     return (
-         get_bias_metric_specs()
-         + get_toxicity_metric_specs()
-         + (get_basic_metric_specs([]) if include_basic_metrics else [])
-     )
-
-
- def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args)
-     ] + get_basic_metric_specs([])
-
-
- def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric",
-             args={"num_respondents": num_respondents},
-         )
-     ]
-
-
- def get_srn_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])
-
-
- def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]:
-     metric_specs: List[MetricSpec] = get_basic_metric_specs(
-         ["exact_match", "quasi_exact_match", "absolute_value_difference"]
-     )
-
-     # The solvers are slow to run so make them skippable
-     if run_solver:
-         metric_specs += [
-             MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}),
-         ]
-     return metric_specs
-
-
- def get_math_metric_specs(use_chain_of_thought: bool = True) -> List[MetricSpec]:
-     return get_basic_metric_specs(["math_equiv_chain_of_thought" if use_chain_of_thought else "math_equiv"])
-
-
- def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
-     if args is None:
-         args = {}
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-             args={**args, "name": "longest_common_prefix_length"},
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-             args={**args, "name": "edit_distance"},
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-             args={**args, "name": "edit_similarity"},
-         ),
-     ] + get_basic_metric_specs([])
-
-
- def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
-     if args is None:
-         args = {}
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric",
-             args={"name": "monte_carlo_entropy"},
-         ),
-     ] + get_basic_metric_specs([])
-
-
- def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
-     if dataset == "humaneval":
-         return get_basic_metric_specs(["code_eval_acc", "pass"])
-     else:  # APPS.
-         args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
-         return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
-
-
- def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
-
-
- def get_machine_translation_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric", args={})
-     ] + get_basic_metric_specs([])
-
-
- def get_cleva_machine_translation_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.machine_translation_metrics.CLEVAMachineTranslationMetric", args={}
-         )
-     ] + get_basic_metric_specs([])
-
-
- def get_cleva_paraphrase_generation_metric_specs(alpha: float = 0.8) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.paraphrase_generation_metrics.CLEVAParaphraseGenerationMetric",
-             args={"alpha": alpha},  # calculate iBLEU_0.8 by default
-         )
-     ] + get_basic_metric_specs([])
-
-
- def get_verifiability_judgment_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["exact_match", "quasi_exact_match"])
-
-
- def get_instruction_following_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.instruction_following_critique_metrics.InstructionFollowingCritiqueMetric",  # noqa E501
-             args={"num_respondents": num_respondents},
-         )
-     ]
-
-
- def get_cleva_topk_accuracy_metric_specs(k: int = 1, cut_off: int = 5) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.cleva_accuracy_metrics.CLEVATopKAccuracyMetric",
-             args={"k": k, "cut_off": cut_off},
-         )
-     ]
-
-
- def get_cleva_bias_metric_specs() -> List[MetricSpec]:
-     demographic_categories = ["race", "gender"]
-     target_categories = ["adjective", "profession"]
-     cross_dem_target = itertools.product(demographic_categories, target_categories)
-
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric",
-             args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
-         )
-         for dem, tgt in cross_dem_target
-     ] + [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric",
-             args={"mode": "representation", "demographic_category": dem},
-         )
-         for dem in demographic_categories
-     ]
-
-
- def get_cleva_toxicity_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVAToxicityMetric", args={}),
687
- ]
688
-
689
-
690
- def get_cleva_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
691
- return (
692
- get_cleva_bias_metric_specs()
693
- + get_cleva_toxicity_metric_specs()
694
- + (get_basic_metric_specs([]) if include_basic_metrics else [])
695
- )
696
-
697
-
698
- def get_cleva_copyright_metric_spec(args: Optional[Dict] = None) -> List[MetricSpec]:
699
- if args is None:
700
- args = {}
701
- return [
702
- MetricSpec(
703
- class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
704
- args={**args, "name": "longest_common_prefix_length"},
705
- ),
706
- MetricSpec(
707
- class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
708
- args={**args, "name": "edit_distance"},
709
- ),
710
- MetricSpec(
711
- class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
712
- args={**args, "name": "edit_similarity"},
713
- ),
714
- ]
715
-
716
-
717
- def get_cleva_generative_task_metric_spec(task: str, subtask: Optional[str], **kwargs) -> List[MetricSpec]:
718
- CLEVA_GEN_TASK_TO_METRIC: Dict[str, Callable] = {
719
- "opinion_mining:opinion_target_extraction": get_exact_match_metric_specs,
720
- "paraphrase_generation": get_cleva_paraphrase_generation_metric_specs,
721
- "closed_book_question_answering:generative_question_answering": get_exact_match_metric_specs,
722
- "conceptual_generalization": get_cleva_topk_accuracy_metric_specs,
723
- "translation:en2zh": get_cleva_machine_translation_metric_specs,
724
- "translation:zh2en": get_cleva_machine_translation_metric_specs,
725
- "mathematical_calculation:add": get_exact_match_metric_specs,
726
- "mathematical_calculation:sub": get_exact_match_metric_specs,
727
- "mathematical_calculation:mul": get_exact_match_metric_specs,
728
- "inductive_reasoning:add": get_exact_match_metric_specs,
729
- "inductive_reasoning:sub": get_exact_match_metric_specs,
730
- "inductive_reasoning:mul": get_exact_match_metric_specs,
731
- "reasoning_primitive:dyck_language": get_exact_match_metric_specs,
732
- "reasoning_primitive:pattern_induction": get_exact_match_metric_specs,
733
- "reasoning_primitive:pattern_matching": get_exact_match_metric_specs,
734
- "reasoning_primitive:variable_sub": get_exact_match_metric_specs,
735
- "subject_knowledge:art": get_exact_match_metric_specs,
736
- "subject_knowledge:biomedicine": get_exact_match_metric_specs,
737
- "subject_knowledge:chemistry": get_exact_match_metric_specs,
738
- "subject_knowledge:computer_science": get_exact_match_metric_specs,
739
- "subject_knowledge:economics": get_exact_match_metric_specs,
740
- "subject_knowledge:geography": get_exact_match_metric_specs,
741
- "subject_knowledge:history": get_exact_match_metric_specs,
742
- "subject_knowledge:law": get_exact_match_metric_specs,
743
- "subject_knowledge:literature": get_exact_match_metric_specs,
744
- "subject_knowledge:math": get_exact_match_metric_specs,
745
- "subject_knowledge:other_general": get_exact_match_metric_specs,
746
- "subject_knowledge:philosophy": get_exact_match_metric_specs,
747
- "subject_knowledge:physics": get_exact_match_metric_specs,
748
- "subject_knowledge:politics": get_exact_match_metric_specs,
749
- "summarization:dialogue_summarization": partial(get_basic_metric_specs, ["chinese_rouge_2"]),
750
- "pinyin_transliteration:pinyin2zh": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
751
- "pinyin_transliteration:zh2pinyin": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
752
- "dialogue_generation:task_oriented": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
753
- "data_to_text_generation": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
754
- "mathematical_reasoning:math_word_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]),
755
- }
756
-
757
- key: str = task
758
- if subtask is not None:
759
- key += ":" + subtask
760
- return CLEVA_GEN_TASK_TO_METRIC[key](**kwargs)
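The table above dispatches on a "task" or "task:subtask" key and uses functools.partial to pre-bind metric names for the basic-metric entries. A standalone sketch of the same lookup, with a placeholder builder standing in for get_basic_metric_specs:

    from functools import partial
    from typing import Callable, Dict, List, Optional

    def basic_specs(names: List[str]) -> List[str]:
        # Placeholder for get_basic_metric_specs; returns labels instead of MetricSpec objects.
        return [f"basic:{name}" for name in names]

    DISPATCH: Dict[str, Callable[..., List[str]]] = {
        "pinyin_transliteration:pinyin2zh": partial(basic_specs, ["chinese_bleu_1"]),
        "summarization:dialogue_summarization": partial(basic_specs, ["chinese_rouge_2"]),
    }

    def specs_for(task: str, subtask: Optional[str] = None, **kwargs) -> List[str]:
        key = task if subtask is None else f"{task}:{subtask}"
        return DISPATCH[key](**kwargs)

    assert specs_for("pinyin_transliteration", "pinyin2zh") == ["basic:chinese_bleu_1"]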
761
-
762
-
763
- ############################################################
764
- # Run specs
765
-
766
-
767
- CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {}
768
- """Dict of run spec function names to run spec functions."""
769
-
770
-
771
- F = TypeVar("F", bound=Callable[..., RunSpec])
772
-
773
-
774
- def run_spec_function(name: str) -> Callable[[F], F]:
775
- """Register the run spec function under the given name."""
776
-
777
- def wrap(func: F) -> F:
778
- if name in CANONICAL_RUN_SPEC_FUNCS:
779
- raise ValueError(f"A run spec function with name {name} already exists")
780
- CANONICAL_RUN_SPEC_FUNCS[name] = func
781
- return func
782
-
783
- return wrap
784
-
785
-
786
- @run_spec_function("simple1")
787
- def get_simple1_spec() -> RunSpec:
788
- """A run spec for debugging."""
789
- return RunSpec(
790
- name="simple1",
791
- scenario_spec=get_scenario_spec1(),
792
- adapter_spec=get_adapter_spec1(),
793
- metric_specs=get_basic_metric_specs([]),
794
- groups=[],
795
- )
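The run_spec_function registry and the simple1 example above disappear from this module because, per the imports added just below, the registry is imported from helm.benchmark.run_spec in 0.5.0. The pattern itself is a plain decorator registry; a minimal standalone re-implementation, with a dict standing in for RunSpec:

    from typing import Callable, Dict, TypeVar

    RUN_SPEC_FUNCS: Dict[str, Callable[..., dict]] = {}
    F = TypeVar("F", bound=Callable[..., dict])

    def run_spec_function(name: str) -> Callable[[F], F]:
        """Register a run-spec builder under the given name, refusing duplicates."""
        def wrap(func: F) -> F:
            if name in RUN_SPEC_FUNCS:
                raise ValueError(f"A run spec function with name {name} already exists")
            RUN_SPEC_FUNCS[name] = func
            return func
        return wrap

    @run_spec_function("simple1")
    def get_simple1_spec() -> dict:
        return {"name": "simple1", "groups": []}

    assert RUN_SPEC_FUNCS["simple1"]() == {"name": "simple1", "groups": []}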
44
+ from helm.benchmark.metrics.metric import MetricSpec
45
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
46
+ from helm.benchmark.runner import get_benchmark_output_path
47
+ from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
48
+ from helm.common.hierarchical_logger import hlog, htrack
796
49
 
797
50
 
798
51
  @run_spec_function("bbq")
@@ -806,7 +59,9 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run
806
59
  input_noun="Passage",
807
60
  output_noun="Answer",
808
61
  )
809
- metric_specs = get_bbq_metric_specs()
62
+ metric_specs = [
63
+ MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
64
+ ] + get_exact_match_metric_specs()
810
65
 
811
66
  return RunSpec(
812
67
  name=f"bbq:subject={subject},method={method}",
@@ -819,6 +74,8 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run
819
74
 
820
75
  @run_spec_function("msmarco")
821
76
  def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:
77
+ from helm.benchmark.scenarios.msmarco_scenario import MSMARCOScenario
78
+
822
79
  valid_topk = None if valid_topk is None else int(valid_topk)
823
80
  scenario_spec = ScenarioSpec(
824
81
  class_name="helm.benchmark.scenarios.msmarco_scenario.MSMARCOScenario",
@@ -827,11 +84,33 @@ def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:
827
84
 
828
85
  adapter_spec: AdapterSpec = get_ranking_binary_adapter_spec(max_train_instances=4, stop_sequences=["\n"])
829
86
 
87
+ # Names of the measures we want to compute.
88
+ measure_names = MSMARCOScenario.MEASURE_NAMES[track]
89
+ multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
90
+
91
+ metric_specs = (
92
+ [
93
+ MetricSpec(
94
+ class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
95
+ args={
96
+ "method": ADAPT_RANKING_BINARY,
97
+ "measure_names": measure_names,
98
+ "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
99
+ "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
100
+ "rank": valid_topk,
101
+ "multiple_relevance_values": multiple_relevance_values,
102
+ },
103
+ ),
104
+ ]
105
+ + get_basic_reference_metric_specs()
106
+ + get_generic_metric_specs()
107
+ )
108
+
830
109
  return RunSpec(
831
110
  name=f"msmarco:track={track},valid_topk={valid_topk}",
832
111
  scenario_spec=scenario_spec,
833
112
  adapter_spec=adapter_spec,
834
- metric_specs=get_msmarco_metric_specs(track=track, rank=valid_topk),
113
+ metric_specs=metric_specs,
835
114
  groups=[f"msmarco_{track}"],
836
115
  )
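A pattern that recurs throughout the 0.5.0 side of this file: scenario helpers such as MSMARCOScenario are imported inside the run-spec function rather than at module level, presumably so that importing the run-spec module stays cheap and optional scenario dependencies are only loaded when that spec is actually built. A tiny self-contained illustration of the deferred-import behavior, using a stdlib module as the stand-in:

    import sys

    def build_sketch_spec() -> dict:
        # Deferred import: the stand-in "heavy" module loads only when the spec is built.
        import csv  # placeholder for a scenario module with extra dependencies
        return {"name": "msmarco", "scenario_module": csv.__name__}

    print("stand-in module loaded before building spec:", "csv" in sys.modules)
    build_sketch_spec()
    print("stand-in module loaded after building spec:", "csv" in sys.modules)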
837
116
 
@@ -905,28 +184,6 @@ def get_custom_mcqa_spec(
905
184
  )
906
185
 
907
186
 
908
- @run_spec_function("mmlu")
909
- def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
910
- scenario_spec = ScenarioSpec(
911
- class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": subject}
912
- )
913
-
914
- adapter_spec = get_multiple_choice_adapter_spec(
915
- method=method,
916
- instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.",
917
- input_noun="Question",
918
- output_noun="Answer",
919
- )
920
-
921
- return RunSpec(
922
- name=f"mmlu:subject={subject},method={method}",
923
- scenario_spec=scenario_spec,
924
- adapter_spec=adapter_spec,
925
- metric_specs=get_exact_match_metric_specs(),
926
- groups=["mmlu"],
927
- )
928
-
929
-
930
187
  @run_spec_function("interactive_qa_mmlu")
931
188
  def get_interactive_qa_mmlu_spec(subject: str) -> RunSpec:
932
189
  scenario_spec = ScenarioSpec(
@@ -975,29 +232,6 @@ def get_wikifact_spec(k: str, subject: str) -> RunSpec:
975
232
  )
976
233
 
977
234
 
978
- @run_spec_function("commonsense")
979
- def get_commonsense_spec(dataset: str, method: str) -> RunSpec:
980
- scenario_spec = ScenarioSpec(
981
- class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseScenario",
982
- args={"dataset": dataset},
983
- )
984
-
985
- adapter_spec = get_multiple_choice_adapter_spec(
986
- method=method,
987
- instructions="The following are multiple choice questions (with answers) about common sense.",
988
- input_noun="Question",
989
- output_noun="Answer",
990
- )
991
-
992
- return RunSpec(
993
- name=f"commonsense:dataset={dataset},method={method}",
994
- scenario_spec=scenario_spec,
995
- adapter_spec=adapter_spec,
996
- metric_specs=get_exact_match_metric_specs(),
997
- groups=[dataset],
998
- )
999
-
1000
-
1001
235
  @run_spec_function("quac")
1002
236
  def get_quac_spec() -> RunSpec:
1003
237
  scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.quac_scenario.QuACScenario", args={})
@@ -1060,7 +294,7 @@ def get_twitter_aae_spec(demographic: str) -> RunSpec:
1060
294
  name=f"twitter_aae:demographic={demographic}",
1061
295
  scenario_spec=scenario_spec,
1062
296
  adapter_spec=get_language_modeling_adapter_spec(),
1063
- metric_specs=get_basic_metric_specs([]),
297
+ metric_specs=get_language_modeling_metric_specs([]),
1064
298
  groups=["twitter_aae", f"twitter_aae_{demographic}"],
1065
299
  )
1066
300
 
@@ -1088,7 +322,9 @@ def get_real_toxicity_prompts_spec() -> RunSpec:
1088
322
  name="real_toxicity_prompts",
1089
323
  scenario_spec=scenario_spec,
1090
324
  adapter_spec=adapter_spec,
1091
- metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
325
+ metric_specs=get_generative_harms_metric_specs(
326
+ include_basic_metrics=True, include_generative_harms_metrics=True
327
+ ),
1092
328
  groups=["real_toxicity_prompts"],
1093
329
  )
1094
330
 
@@ -1108,46 +344,28 @@ def get_synthetic_reasoning_natural_spec(difficulty: str) -> RunSpec:
1108
344
  max_train_instances=3, # limited by the context length
1109
345
  max_tokens=20,
1110
346
  )
347
+ srn_metric_specs = get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])
1111
348
 
1112
349
  return RunSpec(
1113
350
  name=f"synthetic_reasoning_natural:difficulty={difficulty}",
1114
351
  scenario_spec=scenario_spec,
1115
352
  adapter_spec=adapter_spec,
1116
- metric_specs=get_srn_metric_specs() + get_generative_harms_metric_specs(),
353
+ metric_specs=srn_metric_specs + get_generative_harms_metric_specs(),
1117
354
  groups=["synthetic_reasoning", "synthetic_reasoning_natural"],
1118
355
  )
1119
356
 
1120
357
 
1121
- @run_spec_function("gsm")
1122
- def get_gsm_spec() -> RunSpec:
1123
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={})
1124
-
1125
- # Create AdapterSpec based on the GSM8K paper: https://arxiv.org/pdf/2110.14168.pdf
1126
- adapter_spec = get_generation_adapter_spec(
1127
- input_noun="Q",
1128
- output_noun="A",
1129
- max_train_instances=5, # Due to limited context and long example length
1130
- max_tokens=400, # The paper uses 400 tokens as the max sample length
1131
- stop_sequences=["\n\n"], # Since answer may contain newlines, we use two as SEP
1132
- )
1133
-
1134
- return RunSpec(
1135
- name="gsm",
1136
- scenario_spec=scenario_spec,
1137
- adapter_spec=adapter_spec,
1138
- metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(),
1139
- groups=["gsm"],
1140
- )
1141
-
1142
-
1143
358
  @run_spec_function("raft")
1144
359
  def get_raft_spec(subset: str) -> RunSpec:
360
+ from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions
361
+
1145
362
  scenario_spec = ScenarioSpec(
1146
363
  class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset}
1147
364
  )
1148
365
 
366
+ scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name)
1149
367
  adapter_spec = get_generation_adapter_spec(
1150
- instructions=get_raft_instructions(subset),
368
+ instructions=get_raft_instructions(subset, scenario_cache_path),
1151
369
  input_noun=None,
1152
370
  output_noun="Label",
1153
371
  max_tokens=30, # at most ~50 characters per label
@@ -1166,9 +384,10 @@ def get_raft_spec(subset: str) -> RunSpec:
1166
384
  def get_numeracy_spec(
1167
385
  relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
1168
386
  ) -> RunSpec:
1169
- from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
387
+ from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
1170
388
 
1171
- run_solver: bool = True if run_solver == "True" else False # type: ignore
389
+ run_solver_bool: bool = True if run_solver == "True" else False
390
+ del run_solver
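Several run-spec functions take string-typed flags (run_solver: str = "False" here) and parse them by comparing against "True"; the 0.5.0 change stores the result in a new variable and deletes the original name instead of re-typing the parameter, which is what forced the old # type: ignore. A small sketch of a slightly stricter version of the same parse, not the HELM code itself:

    def parse_bool_flag(value: str) -> bool:
        # Accept only the two literal spellings used by these string-typed run-spec arguments.
        if value not in ("True", "False"):
            raise ValueError(f"Expected 'True' or 'False', got {value!r}")
        return value == "True"

    assert parse_bool_flag("True") is True
    assert parse_bool_flag("False") is False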
1172
391
  random_seed = int(seed)
1173
392
  scenario_spec = ScenarioSpec(
1174
393
  class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
@@ -1208,72 +427,11 @@ def get_numeracy_spec(
1208
427
  name=f"numeracy:relation_type={relation_type},mode={mode}",
1209
428
  scenario_spec=scenario_spec,
1210
429
  adapter_spec=adapter_spec,
1211
- metric_specs=get_numeracy_metric_specs(run_solver), # type: ignore
430
+ metric_specs=get_numeracy_metric_specs(run_solver_bool),
1212
431
  groups=["numeracy"],
1213
432
  )
1214
433
 
1215
434
 
1216
- @run_spec_function("math")
1217
- def get_math_spec(
1218
- subject: str,
1219
- level: str,
1220
- use_official_examples: str = "False",
1221
- use_chain_of_thought: str = "False",
1222
- ) -> RunSpec:
1223
- use_official_examples: bool = use_official_examples == "True" # type: ignore
1224
- use_chain_of_thought: bool = use_chain_of_thought == "True" # type: ignore
1225
- if use_chain_of_thought:
1226
- assert not use_official_examples, "Cannot use official examples when use_chain_of_thought is True."
1227
- scenario_spec = ScenarioSpec(
1228
- class_name="helm.benchmark.scenarios.math_scenario.MATHScenario",
1229
- args={
1230
- "subject": subject,
1231
- "level": level,
1232
- "use_official_examples": use_official_examples,
1233
- "use_chain_of_thought": use_chain_of_thought,
1234
- },
1235
- )
1236
-
1237
- if use_chain_of_thought: # Include the solution in the output as per https://arxiv.org/abs/2201.11903
1238
- output_prefix = "Answer: " # Don't include LaTeX '$' delimiters
1239
- output_suffix = "\n"
1240
- instance_prefix = "###\n" # Don't include LaTeX '$' delimiters
1241
- max_tokens = 400 # Increase the number of tokens to generate
1242
- stop_sequences = ["###"] # Break at the next instance; extraneous output will be stripped out
1243
- groups = ["math_chain_of_thought"]
1244
- else:
1245
- output_prefix = "Answer: $"
1246
- output_suffix = "$\n"
1247
- instance_prefix = "###\n"
1248
- max_tokens = 20
1249
- stop_sequences = ["$"] # Break at the nearest LaTeX closing delimiter
1250
- groups = ["math_regular"]
1251
-
1252
- adapter_spec = AdapterSpec(
1253
- method=ADAPT_GENERATION,
1254
- instructions="Given a mathematics problem, determine the answer. Simplify your answer as much as possible.\n",
1255
- max_train_instances=8,
1256
- num_outputs=1,
1257
- temperature=0.0,
1258
- stop_sequences=stop_sequences,
1259
- max_tokens=max_tokens,
1260
- input_prefix="Problem: ",
1261
- input_suffix="\n",
1262
- output_prefix=output_prefix,
1263
- output_suffix=output_suffix,
1264
- instance_prefix=instance_prefix,
1265
- )
1266
-
1267
- return RunSpec(
1268
- name=f"math:subject={subject},level={level},"
1269
- f"use_official_examples={use_official_examples},use_chain_of_thought={use_chain_of_thought}",
1270
- scenario_spec=scenario_spec,
1271
- adapter_spec=adapter_spec,
1272
- metric_specs=get_math_metric_specs(use_chain_of_thought) + get_generative_harms_metric_specs(), # type: ignore
1273
- groups=groups,
1274
- )
1275
-
1276
-
1277
435
  @run_spec_function("boolq")
1278
436
  def get_boolq_spec(only_contrast=False) -> RunSpec:
1279
437
  scenario_spec = ScenarioSpec(
@@ -1358,6 +516,8 @@ def get_copyright_spec(
1358
516
  normalize_by_prefix_length=True,
1359
517
  normalize_newline_space_tab=False,
1360
518
  ) -> RunSpec:
519
+ from helm.benchmark.scenarios.copyright_scenario import datatag2hash_code
520
+
1361
521
  scenario_spec = ScenarioSpec(
1362
522
  class_name="helm.benchmark.scenarios.copyright_scenario.CopyrightScenario", args=dict(datatag=datatag)
1363
523
  )
@@ -1470,36 +630,21 @@ def get_code_spec(dataset: str, timeout=3) -> RunSpec:
1470
630
  max_tokens=600,
1471
631
  )
1472
632
 
633
+ if dataset == "humaneval":
634
+ code_metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"])
635
+ else: # APPS.
636
+ args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
637
+ code_metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
638
+
1473
639
  return RunSpec(
1474
640
  name=f"code:dataset={dataset}",
1475
641
  scenario_spec=scenario_spec,
1476
642
  adapter_spec=adapter_spec,
1477
- metric_specs=get_code_metric_specs(dataset, timeout) + get_generative_harms_metric_specs(),
643
+ metric_specs=code_metric_specs + get_generative_harms_metric_specs(),
1478
644
  groups=[f"code_{dataset}"],
1479
645
  )
1480
646
 
1481
647
 
1482
- @run_spec_function("natural_qa")
1483
- def get_natural_qa_spec(mode: str) -> RunSpec:
1484
- scenario_spec = ScenarioSpec(
1485
- class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario", args={"mode": mode}
1486
- )
1487
-
1488
- adapter_spec = get_generation_adapter_spec(
1489
- input_noun="Question" if mode == "closedbook" else None,
1490
- output_noun="Answer",
1491
- max_tokens=300, # answers are at most 65 words
1492
- )
1493
-
1494
- return RunSpec(
1495
- name=f"natural_qa:mode={mode}",
1496
- scenario_spec=scenario_spec,
1497
- adapter_spec=adapter_spec,
1498
- metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(),
1499
- groups=[f"natural_qa_{mode}"],
1500
- )
1501
-
1502
-
1503
648
  @run_spec_function("the_pile")
1504
649
  def get_the_pile_spec(subset: str) -> RunSpec:
1505
650
  scenario_spec = ScenarioSpec(
@@ -1510,7 +655,7 @@ def get_the_pile_spec(subset: str) -> RunSpec:
1510
655
  name=f"the_pile:subset={subset}",
1511
656
  scenario_spec=scenario_spec,
1512
657
  adapter_spec=get_language_modeling_adapter_spec(),
1513
- metric_specs=get_basic_metric_specs([]),
658
+ metric_specs=get_language_modeling_metric_specs([]),
1514
659
  groups=["the_pile"],
1515
660
  )
1516
661
 
@@ -1523,32 +668,11 @@ def get_ice_spec(**kwargs) -> RunSpec:
1523
668
  name="ice" + (":" if len(kwargs) > 0 else "") + ",".join(f"{k}={v}" for k, v in sorted(kwargs.items())),
1524
669
  scenario_spec=scenario_spec,
1525
670
  adapter_spec=get_language_modeling_adapter_spec(),
1526
- metric_specs=get_basic_metric_specs([]),
671
+ metric_specs=get_language_modeling_metric_specs([]),
1527
672
  groups=["ice"],
1528
673
  )
1529
674
 
1530
675
 
1531
- @run_spec_function("narrative_qa")
1532
- def get_narrativeqa_spec() -> RunSpec:
1533
- scenario_spec = ScenarioSpec(
1534
- class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario", args={}
1535
- )
1536
-
1537
- adapter_spec = get_generation_adapter_spec(
1538
- input_noun="Passage",
1539
- output_noun="Answer",
1540
- max_tokens=100, # max 30 words
1541
- )
1542
-
1543
- return RunSpec(
1544
- name="narrative_qa",
1545
- scenario_spec=scenario_spec,
1546
- adapter_spec=adapter_spec,
1547
- metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
1548
- groups=["narrative_qa"],
1549
- )
1550
-
1551
-
1552
676
  @run_spec_function("synthetic_efficiency")
1553
677
  def get_synthetic_efficiency_spec(
1554
678
  num_prompt_tokens: Optional[int] = None,
@@ -1570,7 +694,9 @@ def get_synthetic_efficiency_spec(
1570
694
  name=f"synthetic_efficiency:random={random}",
1571
695
  scenario_spec=scenario_spec,
1572
696
  adapter_spec=adapter_spec,
1573
- metric_specs=get_basic_metric_specs(["exact_match"]) + get_generative_harms_metric_specs(),
697
+ metric_specs=get_basic_generation_metric_specs(["exact_match"])
698
+ + get_generic_metric_specs()
699
+ + get_generative_harms_metric_specs(),
1574
700
  groups=["synthetic_efficiency"],
1575
701
  )
1576
702
 
@@ -1609,7 +735,7 @@ def get_wikitext_103_spec() -> RunSpec:
1609
735
  name="wikitext_103",
1610
736
  scenario_spec=scenario_spec,
1611
737
  adapter_spec=get_language_modeling_adapter_spec(),
1612
- metric_specs=get_basic_metric_specs([]),
738
+ metric_specs=get_language_modeling_metric_specs([]),
1613
739
  groups=["wikitext_103"],
1614
740
  )
1615
741
 
@@ -1757,7 +883,9 @@ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec:
1757
883
  name=f"dyck_language_np={int(num_parenthesis_pairs)}",
1758
884
  scenario_spec=scenario_spec,
1759
885
  adapter_spec=adapter_spec,
1760
- metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(),
886
+ metric_specs=get_basic_generation_metric_specs(["exact_match_indicator"])
887
+ + get_generic_metric_specs()
888
+ + get_generative_harms_metric_specs(),
1761
889
  groups=["dyck_language"],
1762
890
  )
1763
891
 
@@ -1827,6 +955,8 @@ def get_entity_data_imputation_spec(dataset: str) -> RunSpec:
1827
955
  @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec")
1828
956
  @run_spec_function("big_bench")
1829
957
  def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
958
+ from helm.benchmark.scenarios.big_bench_scenario import BIGBenchScenario
959
+
1830
960
  def get_adaptation_method(big_bench_metrics: List[str]) -> str:
1831
961
  """
1832
962
  From BIG-bench, "there are three types of BIG-bench JSON tasks - generative and scoring
@@ -1871,16 +1001,14 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
1871
1001
  )
1872
1002
 
1873
1003
  # Get BIG-bench task definition.
1874
- # TODO: get `output_path` here without hardcoding
1875
- output_path: str = "benchmark_output/scenarios/big_bench"
1876
- big_bench_task: Dict = BIGBenchScenario.download_and_get_task(output_path, task, subtask)
1004
+ scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name)
1005
+ big_bench_task: Dict = BIGBenchScenario.download_and_get_task(scenario_cache_path, task, subtask)
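Here the hardcoded "benchmark_output/scenarios/big_bench" path is replaced by get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name). The sketch below is only an assumption about the shape of that path, inferred from the old hardcoded value; it is not the actual HELM helper:

    import os

    def scenario_cache_path_sketch(benchmark_output_path: str, scenario_name: str) -> str:
        # Hypothetical stand-in for get_scenario_cache_path; layout guessed from the
        # removed hardcoded "benchmark_output/scenarios/big_bench" value.
        return os.path.join(benchmark_output_path, "scenarios", scenario_name)

    assert scenario_cache_path_sketch("benchmark_output", "big_bench") == os.path.join(
        "benchmark_output", "scenarios", "big_bench"
    )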
1877
1006
 
1878
1007
  # The JSON schema for BIG-bench can be found here:
1879
1008
  # https://github.com/google/BIG-bench/blob/main/docs/doc.md#json-schema.
1880
1009
  # "metrics" is a required field. The default values were populated using the link above.
1881
1010
  adapter_spec = AdapterSpec(
1882
1011
  method=get_adaptation_method(big_bench_task["metrics"]),
1883
- model="openai/text-curie-001", # Can override with the `ModelRunExpander`.
1884
1012
  max_train_instances=5, # Can override with the `MaxTrainInstancesRunExpander`.
1885
1013
  num_outputs=1, # Can override with the `NumOutputsRunExpander`.
1886
1014
  # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
@@ -1907,9 +1035,8 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
1907
1035
  name=run_spec_name,
1908
1036
  scenario_spec=scenario_spec,
1909
1037
  adapter_spec=adapter_spec,
1910
- # TODO add generative harms when applicable
1911
1038
  metric_specs=get_metric_specs(big_bench_task["metrics"]),
1912
- groups=["BIG-bench"],
1039
+ groups=[f"big_bench_{task}"],
1913
1040
  )
1914
1041
 
1915
1042
 
@@ -1991,7 +1118,7 @@ def get_med_mcqa_spec() -> RunSpec:
1991
1118
  scenario_spec=scenario_spec,
1992
1119
  adapter_spec=adapter_spec,
1993
1120
  metric_specs=get_exact_match_metric_specs(),
1994
- groups=["MedMCQA"],
1121
+ groups=["med_mcqa"],
1995
1122
  )
1996
1123
 
1997
1124
 
@@ -2017,48 +1144,86 @@ def get_med_paragraph_simplification_spec() -> RunSpec:
2017
1144
  )
2018
1145
 
2019
1146
 
2020
- @run_spec_function("med_qa")
2021
- def get_med_qa_spec() -> RunSpec:
2022
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
1147
+ @run_spec_function("pubmed_qa")
1148
+ def get_pubmed_qa_spec() -> RunSpec:
1149
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
2023
1150
 
2024
1151
  adapter_spec = get_multiple_choice_adapter_spec(
2025
1152
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
2026
- instructions="Give a letter answer among A, B, C or D.",
1153
+ instructions="Answer A for yes, B for no or C for maybe.",
2027
1154
  input_noun="Question",
2028
1155
  output_noun="Answer",
2029
1156
  )
2030
1157
 
2031
1158
  return RunSpec(
2032
- name="med_qa",
1159
+ name="pubmed_qa",
2033
1160
  scenario_spec=scenario_spec,
2034
1161
  adapter_spec=adapter_spec,
2035
1162
  metric_specs=get_exact_match_metric_specs(),
2036
- groups=["MedQA"],
1163
+ groups=["pubmed_qa"],
2037
1164
  )
2038
1165
 
2039
1166
 
2040
- @run_spec_function("pubmed_qa")
2041
- def get_pubmed_qa_spec() -> RunSpec:
2042
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
1167
+ @run_spec_function("live_qa")
1168
+ def get_live_qa_spec() -> RunSpec:
1169
+ from helm.common.gpu_utils import get_torch_device_name
2043
1170
 
2044
- adapter_spec = get_multiple_choice_adapter_spec(
2045
- method=ADAPT_MULTIPLE_CHOICE_JOINT,
2046
- instructions="Answer A for yes, B for no or C for maybe.",
1171
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
1172
+
1173
+ adapter_spec = get_generation_adapter_spec(
1174
+ instructions="Please answer the following consumer health question.",
2047
1175
  input_noun="Question",
2048
1176
  output_noun="Answer",
1177
+ max_train_instances=0,
1178
+ max_tokens=512,
2049
1179
  )
2050
1180
 
2051
1181
  return RunSpec(
2052
- name="pubmed_qa",
1182
+ name="live_qa",
2053
1183
  scenario_spec=scenario_spec,
2054
1184
  adapter_spec=adapter_spec,
2055
- metric_specs=get_exact_match_metric_specs(),
2056
- groups=["pubmed_qa"],
1185
+ metric_specs=get_summarization_metric_specs(
1186
+ {"task": "live_qa", "device": get_torch_device_name()},
1187
+ ),
1188
+ groups=["live_qa"],
1189
+ )
1190
+
1191
+
1192
+ @run_spec_function("medication_qa")
1193
+ def get_medication_qa_spec() -> RunSpec:
1194
+ from helm.common.gpu_utils import get_torch_device_name
1195
+
1196
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
1197
+
1198
+ adapter_spec = get_generation_adapter_spec(
1199
+ instructions="Please answer the following consumer health question.",
1200
+ input_noun="Question",
1201
+ output_noun="Answer",
1202
+ max_train_instances=0,
1203
+ max_tokens=512,
1204
+ )
1205
+
1206
+ return RunSpec(
1207
+ name="medication_qa",
1208
+ scenario_spec=scenario_spec,
1209
+ adapter_spec=adapter_spec,
1210
+ metric_specs=get_summarization_metric_specs(
1211
+ {"task": "medication_qa", "device": get_torch_device_name()},
1212
+ ),
1213
+ groups=["medication_qa"],
2057
1214
  )
2058
1215
 
2059
1216
 
2060
1217
  @run_spec_function("lextreme")
2061
1218
  def get_lextreme_spec(subset: str) -> RunSpec:
1219
+ from helm.benchmark.scenarios.lextreme_scenario import (
1220
+ get_lextreme_instructions,
1221
+ get_lextreme_max_train_instances,
1222
+ get_lextreme_max_tokens,
1223
+ TaskType,
1224
+ get_lextreme_task_type,
1225
+ )
1226
+
2062
1227
  task_type = get_lextreme_task_type(subset)
2063
1228
 
2064
1229
  scenario_spec = ScenarioSpec(
@@ -2075,7 +1240,7 @@ def get_lextreme_spec(subset: str) -> RunSpec:
2075
1240
  multi_label=(task_type == TaskType.MLTC),
2076
1241
  )
2077
1242
 
2078
- metric_specs = get_basic_metric_specs([])
1243
+ metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
2079
1244
  if task_type == TaskType.MLTC:
2080
1245
  metric_specs += get_classification_metric_specs(delimiter=", ")
2081
1246
  elif task_type == TaskType.SLTC:
@@ -2092,6 +1257,14 @@ def get_lextreme_spec(subset: str) -> RunSpec:
2092
1257
 
2093
1258
  @run_spec_function("lex_glue")
2094
1259
  def get_lex_glue_spec(subset: str) -> RunSpec:
1260
+ from helm.benchmark.scenarios.lex_glue_scenario import (
1261
+ get_lex_glue_instructions,
1262
+ get_lex_glue_max_tokens,
1263
+ get_lex_glue_max_train_instances,
1264
+ get_lex_glue_task_type,
1265
+ )
1266
+ from helm.benchmark.scenarios.lextreme_scenario import TaskType
1267
+
2095
1268
  task_type = get_lex_glue_task_type(subset)
2096
1269
 
2097
1270
  scenario_spec = ScenarioSpec(
@@ -2108,7 +1281,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec:
2108
1281
  multi_label=(task_type == TaskType.MLTC),
2109
1282
  )
2110
1283
 
2111
- metric_specs = get_basic_metric_specs([])
1284
+ metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
2112
1285
  if task_type == TaskType.MLTC:
2113
1286
  metric_specs += get_classification_metric_specs(delimiter=", ")
2114
1287
  elif task_type == TaskType.SLTC:
@@ -2207,92 +1380,6 @@ def get_eurlexsum_legal_summarization_spec(temperature: float = 0.3, device: str
2207
1380
  )
2208
1381
 
2209
1382
 
2210
- @run_spec_function("wmt_14")
2211
- def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
2212
- FULL_LANGUAGE_NAMES = {
2213
- "cs": "Czech",
2214
- "de": "German",
2215
- "fr": "French",
2216
- "hi": "Hindi",
2217
- "ru": "Russian",
2218
- "en": "English",
2219
- }
2220
- source_language, target_language = language_pair.split("-")
2221
-
2222
- scenario_spec = ScenarioSpec(
2223
- class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario",
2224
- args={"source_language": source_language, "target_language": target_language},
2225
- )
2226
-
2227
- adapter_spec = get_machine_translation_adapter_spec(
2228
- source_language=FULL_LANGUAGE_NAMES[source_language],
2229
- target_language=FULL_LANGUAGE_NAMES[target_language],
2230
- max_train_instances=max_train_instances,
2231
- )
2232
-
2233
- return RunSpec(
2234
- name=f"wmt_14:language_pair={language_pair}",
2235
- scenario_spec=scenario_spec,
2236
- adapter_spec=adapter_spec,
2237
- metric_specs=get_machine_translation_metric_specs(),
2238
- groups=["wmt_14"],
2239
- )
2240
-
2241
-
2242
- @run_spec_function("self_instruct")
2243
- def get_self_instruct_spec(num_respondents: int) -> RunSpec:
2244
- scenario_spec = ScenarioSpec(
2245
- class_name="helm.benchmark.scenarios.self_instruct_scenario.SelfInstructScenario",
2246
- args={},
2247
- )
2248
-
2249
- adapter_spec = get_instruct_adapter_spec()
2250
-
2251
- return RunSpec(
2252
- name="self_instruct",
2253
- scenario_spec=scenario_spec,
2254
- adapter_spec=adapter_spec,
2255
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2256
- groups=["self_instruct"],
2257
- )
2258
-
2259
-
2260
- @run_spec_function("vicuna")
2261
- def get_vicuna_spec(num_respondents: int, category: str = "all") -> RunSpec:
2262
- scenario_spec = ScenarioSpec(
2263
- class_name="helm.benchmark.scenarios.vicuna_scenario.VicunaScenario",
2264
- args={"category": category},
2265
- )
2266
-
2267
- adapter_spec = get_instruct_adapter_spec()
2268
-
2269
- return RunSpec(
2270
- name=f"vicuna:category={category}", # TODO: add args
2271
- scenario_spec=scenario_spec,
2272
- adapter_spec=adapter_spec,
2273
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2274
- groups=["vicuna"],
2275
- )
2276
-
2277
-
2278
- @run_spec_function("grammar")
2279
- def get_grammar_spec(num_respondents: int, path: str, tags: str) -> RunSpec:
2280
- scenario_spec = ScenarioSpec(
2281
- class_name="helm.benchmark.scenarios.grammar_scenario.GrammarScenario",
2282
- args={"path": path, "tags": tags},
2283
- )
2284
-
2285
- adapter_spec = get_instruct_adapter_spec()
2286
-
2287
- return RunSpec(
2288
- name=f"grammar:path={path},tags={tags}",
2289
- scenario_spec=scenario_spec,
2290
- adapter_spec=adapter_spec,
2291
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2292
- groups=["grammar"],
2293
- )
2294
-
2295
-
2296
1383
  @run_spec_function("verifiability_judgment")
2297
1384
  def get_verifiability_judgment_spec() -> RunSpec:
2298
1385
  scenario_spec = ScenarioSpec(
@@ -2315,7 +1402,7 @@ def get_verifiability_judgment_spec() -> RunSpec:
2315
1402
  name="verifiability_judgment",
2316
1403
  scenario_spec=scenario_spec,
2317
1404
  adapter_spec=adapter_spec,
2318
- metric_specs=get_verifiability_judgment_metric_specs(),
1405
+ metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match"]),
2319
1406
  groups=["verifiability_judgment"],
2320
1407
  )
2321
1408
 
@@ -2355,269 +1442,69 @@ def get_opinions_qa_spec(
2355
1442
  )
2356
1443
 
2357
1444
 
2358
- @run_spec_function("open_assistant")
2359
- def get_open_assistant_spec(num_respondents: int, language: str) -> RunSpec:
2360
- scenario_spec = ScenarioSpec(
2361
- class_name="helm.benchmark.scenarios.open_assistant_scenario.OpenAssistantScenario",
2362
- args={"language": language},
2363
- )
2364
-
2365
- adapter_spec = get_instruct_adapter_spec()
2366
-
2367
- return RunSpec(
2368
- name=f"open_assistant:language={language}",
2369
- scenario_spec=scenario_spec,
2370
- adapter_spec=adapter_spec,
2371
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2372
- groups=["open_assistant"],
2373
- )
2374
-
2375
-
2376
- @run_spec_function("koala")
2377
- def get_koala_spec(num_respondents: int) -> RunSpec:
1445
+ @run_spec_function("lm_entry")
1446
+ def get_lm_entry_spec(task: str, method: str = ADAPT_GENERATION) -> RunSpec:
2378
1447
  scenario_spec = ScenarioSpec(
2379
- class_name="helm.benchmark.scenarios.koala_scenario.KoalaScenario",
2380
- args={},
1448
+ class_name="helm.benchmark.scenarios.lm_entry_scenario.LMEntryScenario",
1449
+ args={"task": task},
2381
1450
  )
1451
+ adapter_spec: AdapterSpec
1452
+ metric_specs: List[MetricSpec]
2382
1453
 
2383
- adapter_spec = get_instruct_adapter_spec()
1454
+ if method == ADAPT_MULTIPLE_CHOICE_JOINT:
1455
+ if task in ["first_letter", "last_letter", "first_word", "last_word", "word_before", "word_after"]:
1456
+ raise ValueError(f"Task {task} cannot be cast to multiple choice.")
1457
+
1458
+ adapter_spec = get_multiple_choice_adapter_spec(
1459
+ method=method,
1460
+ instructions="Answer the following multiple choice question with a single letter",
1461
+ input_noun="Question",
1462
+ output_noun="\nAnswer",
1463
+ )
1464
+ metric_specs = get_exact_match_metric_specs()
1465
+ elif method == ADAPT_GENERATION:
1466
+ adapter_spec = get_generation_adapter_spec(
1467
+ instructions="Answer the following question in one word.",
1468
+ input_noun="Q",
1469
+ output_noun="\nA",
1470
+ # Shouldn't use any stop sequences because the task is zero-shot and thus we
1471
+ # don't expect the model to magically figure out the output format.
1472
+ stop_sequences=[],
1473
+ # Set max_tokens to save tokens. The answer is a word so 10 tokens should suffice.
1474
+ max_tokens=10,
1475
+ )
1476
+ # It makes no sense to include non-quasi exact match metrics for this task.
1477
+ metric_specs = get_basic_metric_specs(["quasi_exact_match", "quasi_prefix_exact_match", "f1_score"])
1478
+ else:
1479
+ raise ValueError(f"Unknown method: {method}")
2384
1480
 
2385
1481
  return RunSpec(
2386
- name="koala",
1482
+ name=f"lm_entry:task={task},method={method}",
2387
1483
  scenario_spec=scenario_spec,
2388
1484
  adapter_spec=adapter_spec,
2389
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2390
- groups=["koala"],
1485
+ metric_specs=metric_specs,
1486
+ groups=["lm_entry"],
2391
1487
  )
2392
1488
 
2393
1489
 
2394
- @run_spec_function("anthropic_hh_rlhf")
2395
- def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec:
1490
+ @run_spec_function("thai_exam")
1491
+ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
2396
1492
  scenario_spec = ScenarioSpec(
2397
- class_name="helm.benchmark.scenarios.anthropic_hh_rlhf_scenario.AnthropicHHRLHFScenario",
2398
- args={"subset": subset},
1493
+ class_name="helm.benchmark.scenarios.thai_exam_scenario.ThaiExamScenario", args={"exam": exam}
2399
1494
  )
2400
1495
 
2401
- adapter_spec = get_instruct_adapter_spec()
2402
-
2403
- return RunSpec(
2404
- name=f"anthropic_hh_rlhf:subset={subset}",
2405
- scenario_spec=scenario_spec,
2406
- adapter_spec=adapter_spec,
2407
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2408
- groups=["anthropic_hh_rlhf"],
2409
- )
2410
-
2411
-
2412
- @run_spec_function("cleva")
2413
- def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, prompt_id: int = 0) -> RunSpec:
2414
- from .scenarios.cleva_scenario import CLEVAScenario # noqa
2415
-
2416
- CLEVAScenario.download_dataset(task, version)
2417
-
2418
- _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
2419
- inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)
2420
-
2421
- class_name_prefix = "".join([word.capitalize() for word in task.split("_")])
2422
- scenario_spec = ScenarioSpec(
2423
- class_name=f"helm.benchmark.scenarios.cleva_scenario.CLEVA{class_name_prefix}Scenario",
2424
- args={"version": version, "subtask": subtask, "prompt_id": prompt_id},
1496
+ adapter_spec = get_multiple_choice_adapter_spec(
1497
+ method=method,
1498
+ instructions="The following are multiple choice questions (with answers).",
1499
+ input_noun="Question",
1500
+ output_noun="Answer",
1501
+ max_train_instances=5,
2425
1502
  )
2426
- run_spec_name: str = f"cleva:task={task},version={version},prompt_id={prompt_id}"
2427
- if subtask:
2428
- run_spec_name += f",subtask={subtask}"
2429
-
2430
- if task in ["copyright"]:
2431
- adapter_spec = get_completion_adapter_spec(
2432
- temperature=inference_parameters.get("temperature", 0.2),
2433
- max_tokens=inference_parameters.get("max_tokens", 1024),
2434
- num_outputs=inference_parameters.get("num_outputs", 1),
2435
- )
2436
- args = {"normalize_by_prefix_length": True, "normalize_newline_space_tab": False}
2437
- metric_specs = get_cleva_copyright_metric_spec(args) + get_cleva_generative_harms_metric_specs()
2438
- elif task in ["code_synthesis"]:
2439
- adapter_spec = get_completion_adapter_spec(
2440
- instructions=prompt_setting.instructions,
2441
- temperature=inference_parameters.get("temperature", 0.2),
2442
- # Taken from the original OpenAI paper to prevent the further generation of irrelevant classes/functions
2443
- stop_sequences=inference_parameters.get("stop_sequences", ["\nclass", "\ndef", "\nif", "\nprint"]),
2444
- max_tokens=inference_parameters.get("max_tokens", 600),
2445
- )
2446
- metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) + get_cleva_generative_harms_metric_specs()
2447
- elif task in ["language_modeling"]:
2448
- adapter_spec = get_language_modeling_adapter_spec()
2449
- metric_specs = get_basic_metric_specs([])
2450
- else:
2451
- if prompt_setting.method in [
2452
- ADAPT_MULTIPLE_CHOICE_JOINT,
2453
- ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
2454
- ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
2455
- ]:
2456
- if prompt_setting.method == ADAPT_MULTIPLE_CHOICE_JOINT:
2457
- adapter_spec = AdapterSpec(
2458
- method=prompt_setting.method,
2459
- instructions=prompt_setting.instructions,
2460
- input_prefix=prompt_setting.input_prefix,
2461
- input_suffix=prompt_setting.input_suffix,
2462
- output_prefix=prompt_setting.output_prefix,
2463
- output_suffix=prompt_setting.output_suffix,
2464
- max_train_instances=inference_parameters.get("max_train_instances", 5),
2465
- num_outputs=inference_parameters.get("num_outputs", 5),
2466
- max_tokens=inference_parameters.get("max_tokens", 1),
2467
- temperature=inference_parameters.get("temperature", 0.0),
2468
- stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
2469
- sample_train=inference_parameters.get("sample_train", True),
2470
- multi_label=inference_parameters.get("multi_label", False),
2471
- )
2472
- else:
2473
- adapter_spec = AdapterSpec(
2474
- method=prompt_setting.method,
2475
- instructions=prompt_setting.instructions,
2476
- input_prefix=prompt_setting.input_prefix,
2477
- input_suffix=prompt_setting.input_suffix,
2478
- output_prefix=prompt_setting.output_prefix,
2479
- output_suffix=prompt_setting.output_suffix,
2480
- # Separate is basically language modeling, so can't easily use in-context examples
2481
- max_train_instances=inference_parameters.get("max_train_instances", 5),
2482
- num_outputs=1,
2483
- max_tokens=0,
2484
- temperature=inference_parameters.get("temperature", 0.0),
2485
- sample_train=inference_parameters.get("sample_train", True),
2486
- )
2487
- metric_specs = get_exact_match_metric_specs()
2488
- if task in ["fact_checking", "bias"]:
2489
- metric_specs += get_multiple_choice_classification_metric_specs()
2490
- elif prompt_setting.method == ADAPT_GENERATION:
2491
- adapter_spec = AdapterSpec(
2492
- method=prompt_setting.method,
2493
- instructions=prompt_setting.instructions,
2494
- input_prefix=prompt_setting.input_prefix,
2495
- input_suffix=prompt_setting.input_suffix,
2496
- output_prefix=prompt_setting.output_prefix,
2497
- output_suffix=prompt_setting.output_suffix,
2498
- max_train_instances=inference_parameters.get("max_train_instances", 5),
2499
- num_outputs=inference_parameters.get("num_outputs", 1),
2500
- max_tokens=inference_parameters.get("max_tokens", 20),
2501
- temperature=inference_parameters.get("temperature", 0.0),
2502
- stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
2503
- sample_train=inference_parameters.get("sample_train", True),
2504
- multi_label=inference_parameters.get("multi_label", True),
2505
- )
2506
- metric_specs = (
2507
- get_cleva_generative_task_metric_spec(task, subtask) + get_cleva_generative_harms_metric_specs()
2508
- )
2509
- else:
2510
- raise ValueError(
2511
- f"{task} can only be {ADAPT_GENERATION}, {ADAPT_MULTIPLE_CHOICE_JOINT}, "
2512
- f"{ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} or {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL}"
2513
- )
2514
1503
 
2515
1504
  return RunSpec(
2516
- name=run_spec_name,
1505
+ name=f"thai_exam:exam={exam},method={method}",
2517
1506
  scenario_spec=scenario_spec,
2518
1507
  adapter_spec=adapter_spec,
2519
- metric_specs=metric_specs,
2520
- groups=["cleva", f"cleva_{task}"],
1508
+ metric_specs=get_exact_match_metric_specs(),
1509
+ groups=["thai_exam"],
2521
1510
  )
2522
-
2523
-
2524
- ############################################################
2525
-
2526
-
2527
- def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
2528
- """
2529
- Takes a specification (name, args) and returns a list of `RunSpec`s.
2530
- """
2531
- # Note that we are abusing `spec` a bit because the name is not actually a class name.
2532
- name = spec.class_name
2533
- args = spec.args
2534
-
2535
- if name not in CANONICAL_RUN_SPEC_FUNCS:
2536
- raise ValueError(f"Unknown run spec name: {name}")
2537
-
2538
- # Peel off the run expanders (e.g., model)
2539
- expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore
2540
- args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
2541
-
2542
- # Get the canonical run specs
2543
- run_specs = [CANONICAL_RUN_SPEC_FUNCS[name](**args)]
2544
-
2545
- # Apply expanders
2546
- for expander in expanders:
2547
- run_specs = [
2548
- child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
2549
- ]
2550
-
2551
- def alter_run_spec(run_spec: RunSpec) -> RunSpec:
2552
- try:
2553
- model = get_model(run_spec.adapter_spec.model)
2554
- except ValueError:
2555
- # Models registered from configs cannot have expanders applied to them,
2556
- # because the models will not have been registered yet at this point.
2557
- # TODO: Figure out a cleaner way to deal with this.
2558
- return run_spec
2559
- # For models that strip newlines, when we're generating, we need to set
2560
- # the delimiter to be '###' so we stop properly.
2561
- if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
2562
- ADAPT_GENERATION,
2563
- ADAPT_MULTIPLE_CHOICE_JOINT,
2564
- ):
2565
- stop_expander = StopRunExpander(value="hash")
2566
- run_spec = singleton(stop_expander.expand(run_spec))
2567
-
2568
- if NLG_PREFIX_TAG in model.tags:
2569
- global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
2570
- run_spec = singleton(global_prefix_expander.expand(run_spec))
2571
-
2572
- # When running ChatGPT on non-language modelling tasks, increase max_tokens by 1
2573
- # to add room for the special message role token.
2574
- if OPENAI_CHATGPT_MODEL_TAG in model.tags and run_spec.adapter_spec.max_tokens:
2575
- increase_max_tokens_expander = IncreaseMaxTokensRunExpander(value=1)
2576
- run_spec = singleton(increase_max_tokens_expander.expand(run_spec))
2577
-
2578
- if CHATML_MODEL_TAG in model.tags:
2579
- chatml_expander = ChatMLRunExpander()
2580
- run_spec = singleton(chatml_expander.expand(run_spec))
2581
-
2582
- # Special handling for Anthropic Claude
2583
- if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
2584
- try:
2585
- import anthropic
2586
- from helm.proxy.clients.anthropic_client import AnthropicClient
2587
- except ModuleNotFoundError as e:
2588
- handle_module_not_found_error(e, ["anthropic"])
2589
- claude_run_expanders: List[RunExpander] = []
2590
- claude_run_expanders.append(AddToStopRunExpander(anthropic.HUMAN_PROMPT))
2591
- if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags:
2592
- claude_run_expanders.append(IncreaseMaxTokensRunExpander(value=AnthropicClient.ADDITIONAL_TOKENS))
2593
- # Get scenario tags
2594
- components = run_spec.scenario_spec.class_name.split(".")
2595
- class_name = components[-1]
2596
- module_name = ".".join(components[:-1])
2597
- cls = getattr(importlib.import_module(module_name), class_name)
2598
- scenario_tags: List[str] = cls.tags
2599
- # If the scenario is instruction, do not use PROMPT_ANSWER_START
2600
- if "instructions" in scenario_tags:
2601
- claude_run_expanders.append(
2602
- FormatPromptRunExpander(prefix=anthropic.HUMAN_PROMPT, suffix=f"{anthropic.AI_PROMPT}")
2603
- )
2604
- else:
2605
- claude_run_expanders.append(
2606
- FormatPromptRunExpander(
2607
- prefix=anthropic.HUMAN_PROMPT,
2608
- suffix=f"{anthropic.AI_PROMPT} {AnthropicClient.PROMPT_ANSWER_START}",
2609
- )
2610
- )
2611
- for claude_run_expander in claude_run_expanders:
2612
- run_spec = singleton(claude_run_expander.expand(run_spec))
2613
-
2614
- # For multiple choice
2615
- if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
2616
- increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
2617
- run_spec = singleton(increase_temperature_expander.expand(run_spec))
2618
-
2619
- return run_spec
2620
-
2621
- run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
2622
-
2623
- return run_specs
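To close out this file: the removed construct_run_specs above peels run expanders (model, max_train_instances, and so on) off the arguments and applies them in sequence, so each expander can fan one RunSpec out into several. A standalone sketch of that fan-out loop, with strings standing in for RunSpec objects:

    from typing import Callable, List

    Spec = str  # stand-in for RunSpec
    Expander = Callable[[Spec], List[Spec]]

    def expand_all(specs: List[Spec], expanders: List[Expander]) -> List[Spec]:
        # Mirrors the loop in construct_run_specs: each pass replaces every parent spec
        # with the children produced by the current expander.
        for expander in expanders:
            specs = [child for parent in specs for child in expander(parent)]
        return specs

    by_model: Expander = lambda spec: [f"{spec},model=a", f"{spec},model=b"]
    by_shots: Expander = lambda spec: [f"{spec},max_train_instances={n}" for n in (0, 5)]

    # One base entry fans out into 2 models x 2 shot counts = 4 concrete run specs.
    assert len(expand_all(["mmlu:subject=anatomy"], [by_model, by_shots])) == 4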