crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.2)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -2,16 +2,19 @@ from dataclasses import dataclass, field
2
2
  from typing import List, Optional
3
3
 
4
4
  from helm.common.image_generation_parameters import ImageGenerationParameters
5
+ from helm.common.reeval_parameters import REEvalParameters
5
6
 
6
7
 
7
8
  # Adaptation methods
8
9
  ADAPT_GENERATION: str = "generation"
10
+ ADAPT_CHAT: str = "chat"
9
11
  ADAPT_LANGUAGE_MODELING: str = "language_modeling"
10
12
  ADAPT_MULTIPLE_CHOICE_JOINT: str = "multiple_choice_joint"
13
+ ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: str = "multiple_choice_joint_chain_of_thought"
11
14
  ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: str = "multiple_choice_separate_original"
12
15
  ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: str = "multiple_choice_separate_calibrated"
13
16
  ADAPT_RANKING_BINARY: str = "ranking_binary"
14
-
17
+ ADAPT_EHR_INSTRUCTION: str = "ehr_instruction"
15
18
  ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS: List[str] = [
16
19
  ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
17
20
  ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
@@ -63,6 +66,12 @@ class AdapterSpec:
63
66
  reference_suffix: str = "\n"
64
67
  """The string that is included after each reference (for multiple-choice questions)."""
65
68
 
69
+ chain_of_thought_prefix: str = ""
70
+ """The string that is included before each chain of thought. (e.g., 'Let\'s think step by step')"""
71
+
72
+ chain_of_thought_suffix: str = "\n"
73
+ """The string that is included after each chain of thought. (e.g., 'The correct answer is')"""
74
+
66
75
  output_prefix: str = "Output: "
67
76
  """The string that is included before the correct answer/predicted output (e.g., 'Answer:')."""
68
77
 
@@ -124,6 +133,9 @@ class AdapterSpec:
124
133
  image_generation_parameters: Optional[ImageGenerationParameters] = None
125
134
  """Parameters for image generation."""
126
135
 
136
+ reeval_parameters: Optional[REEvalParameters] = None
137
+ """Parameters for reeval evaluation."""
138
+
127
139
  # Set hash=False to make `AdapterSpec` hashable
128
140
  eval_splits: Optional[List[str]] = field(default=None, hash=False)
129
141
  """The splits from which evaluation instances will be drawn."""
@@ -1,8 +1,11 @@
1
1
  from helm.benchmark.adaptation.adapter_spec import (
2
+ ADAPT_EHR_INSTRUCTION,
2
3
  ADAPT_GENERATION,
4
+ ADAPT_CHAT,
3
5
  ADAPT_GENERATION_MULTIMODAL,
4
6
  ADAPT_LANGUAGE_MODELING,
5
7
  ADAPT_MULTIPLE_CHOICE_JOINT,
8
+ ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
6
9
  ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
7
10
  ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
8
11
  ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
@@ -12,6 +15,7 @@ from helm.benchmark.adaptation.adapter_spec import (
12
15
  from helm.benchmark.adaptation.adapters.adapter import Adapter
13
16
  from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
14
17
  from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
18
+ from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
15
19
  from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
16
20
  from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
17
21
  from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
@@ -19,8 +23,12 @@ from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimo
19
23
  )
20
24
  from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
21
25
  from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
26
+ from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
27
+ MultipleChoiceJointChainOfThoughtAdapter,
28
+ )
22
29
  from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
23
30
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
31
+ from helm.benchmark.adaptation.adapters.ehr_instruction_adapter import EHRInstructionAdapter
24
32
 
25
33
 
26
34
  class AdapterFactory:
@@ -32,12 +40,18 @@ class AdapterFactory:
32
40
  method: str = adapter_spec.method
33
41
  adapter: Adapter
34
42
 
35
- if method == ADAPT_GENERATION:
43
+ if method == ADAPT_EHR_INSTRUCTION:
44
+ adapter = EHRInstructionAdapter(adapter_spec, tokenizer_service)
45
+ elif method == ADAPT_GENERATION:
36
46
  adapter = GenerationAdapter(adapter_spec, tokenizer_service)
47
+ elif method == ADAPT_CHAT:
48
+ adapter = ChatAdapter(adapter_spec, tokenizer_service)
37
49
  elif method == ADAPT_LANGUAGE_MODELING:
38
50
  adapter = LanguageModelingAdapter(adapter_spec, tokenizer_service)
39
51
  elif method == ADAPT_MULTIPLE_CHOICE_JOINT:
40
52
  adapter = MultipleChoiceJointAdapter(adapter_spec, tokenizer_service)
53
+ elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
54
+ adapter = MultipleChoiceJointChainOfThoughtAdapter(adapter_spec, tokenizer_service)
41
55
  elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL:
42
56
  adapter = MultipleChoiceSeparateAdapter(adapter_spec, tokenizer_service)
43
57
  elif method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED:
@@ -3,7 +3,7 @@ from typing import List, Optional
3
3
  from helm.benchmark.adaptation.request_state import RequestState
4
4
  from helm.benchmark.scenarios.scenario import Instance, Reference, TRAIN_SPLIT, EVAL_SPLITS, CORRECT_TAG
5
5
  from helm.common.request import Request
6
- from .in_context_learning_adapter import InContextLearningAdapter
6
+ from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
7
7
 
8
8
 
9
9
  class BinaryRankingAdapter(InContextLearningAdapter):
@@ -0,0 +1,49 @@
1
+ from typing import List
2
+
3
+ from helm.benchmark.adaptation.request_state import RequestState
4
+ from helm.benchmark.scenarios.scenario import Instance
5
+ from helm.common.request import Request
6
+ from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
7
+
8
+
9
+ class ChatAdapter(InContextLearningAdapter):
10
+ """
11
+ Each `Instance` in a `Scenario` has a history of the format:
12
+
13
+ [
14
+ {"role": "user", "content": <user-content>},
15
+ {"role": "assistant", "content": <assistant-content>},
16
+ {"role": "user", "content": <user-content>},
17
+ ...
18
+ ]
19
+
20
+ """
21
+
22
+ def generate_requests(
23
+ self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
24
+ ) -> List[RequestState]:
25
+ if eval_instance.input.messages is None:
26
+ raise ValueError("ChatAdapter requires input.messages of instances to be non-empty")
27
+ request = Request(
28
+ model=self.adapter_spec.model,
29
+ model_deployment=self.adapter_spec.model_deployment,
30
+ messages=eval_instance.input.messages,
31
+ num_completions=self.adapter_spec.num_outputs,
32
+ temperature=self.adapter_spec.temperature,
33
+ max_tokens=self.adapter_spec.max_tokens,
34
+ stop_sequences=self.adapter_spec.stop_sequences,
35
+ random=self.adapter_spec.random,
36
+ image_generation_parameters=self.adapter_spec.image_generation_parameters,
37
+ )
38
+ request_state = RequestState(
39
+ instance=eval_instance,
40
+ reference_index=None,
41
+ request_mode=None,
42
+ train_trial_index=train_trial_index,
43
+ output_mapping=None,
44
+ request=request,
45
+ result=None,
46
+ num_train_instances=0,
47
+ prompt_truncated=False,
48
+ )
49
+ return [request_state]
@@ -0,0 +1,108 @@
1
+ from typing import List, Optional
2
+
3
+ from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
4
+ from helm.benchmark.adaptation.prompt import Prompt
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, Instance
7
+ from helm.benchmark.window_services.window_service import EncodeResult
8
+ from helm.common.tokenization_request import TokenizationToken
9
+
10
+
11
+ # in the prompt templates for EHR instructions, this is the placeholder for the EHR part
12
+ # which we use to compute accurate tokenized sequence lengths
13
+ PROMPT_TEMPLATE_EHR_PLACEHOLDER = "{ehr}"
14
+
15
+
16
+ class EHRInstructionAdapter(GenerationAdapter):
17
+ """
18
+ Each instance consists of the following:
19
+
20
+ EHRInstructionInput:
21
+ question: the question to answer or instruction to follow
22
+ ehr: the XML-tagged EHR to use as context to answer the question
23
+ prompt_template: a string template for how to combine the question + ehr
24
+
25
+ Reference output:
26
+ text: the 'golden' clinician response to the question
27
+
28
+ This Adapter combines the above into RequestStates with logic to truncate the EHR specifically
29
+ to fit in the context window with enough room for the instruction/question and the specified
30
+ amount of generated tokens.
31
+ """
32
+
33
+ def adapt(self, instances: List[Instance], parallelism: int) -> List[RequestState]:
34
+ """
35
+ Main adaptation method which takes all instances and turns them into `RequestState` objects.
36
+ """
37
+ # sanity check, since for now we assume that there are no training instances at all
38
+ if any(instance.split == TRAIN_SPLIT for instance in instances):
39
+ raise RuntimeError(f"Got train instances for {self.__class__.__name__} - expected only eval instances.")
40
+
41
+ # use superclass implementation here
42
+ return super().adapt(instances, parallelism)
43
+
44
+ def construct_prompt(
45
+ self,
46
+ train_instances: List[Instance], # unused
47
+ eval_instance: Instance,
48
+ include_output: bool, # unused
49
+ reference_index: Optional[int], # unused
50
+ ) -> Prompt:
51
+ """
52
+ Uses the instance to construct a prompt for a given eval instance.
53
+
54
+ Parameters
55
+ ----------
56
+ eval_instance: Instance
57
+ the instance we wish to use to construct the prompt
58
+ """
59
+ # start by simply getting the inputs
60
+ question = eval_instance.input.text
61
+ assert eval_instance.extra_data is not None
62
+ ehr_text: str = eval_instance.extra_data["ehr"]
63
+ prompt_template: str = eval_instance.extra_data["prompt_template"]
64
+ full_prompt_text = prompt_template.format(question=question, ehr=ehr_text)
65
+
66
+ # insert the question and see how many tokens we have so far
67
+ prompt_with_instr_no_ehr_placeholder = prompt_template.format(question=question, ehr="")
68
+ num_tokens_no_ehr = self.window_service.get_num_tokens(prompt_with_instr_no_ehr_placeholder)
69
+
70
+ # number of tokens we can allow the EHR part to be
71
+ target_ehr_num_tokens = (
72
+ self.window_service.max_request_length - self.adapter_spec.max_tokens - num_tokens_no_ehr
73
+ )
74
+
75
+ # round-trip tokenization to get the correct token length we need
76
+ # NOTE: we truncate from the left side so that the most recent pieces of the EHR are included in the context
77
+ # as opposed to the canonical way of truncating from the right. This is done to match the MedAlign method.
78
+ full_ehr_tokens: EncodeResult = self.window_service.encode(ehr_text, max_length=None, truncation=False)
79
+ truncated_ehr_tokens: List[TokenizationToken] = full_ehr_tokens.tokens[-target_ehr_num_tokens:]
80
+ ehr_truncated: str
81
+ ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
82
+
83
+ # create the truncated prompt
84
+ truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
85
+ num_truncations = 1
86
+ while (
87
+ num_extra_tokens := self.adapter_spec.max_tokens
88
+ + self.window_service.get_num_tokens(truncated_prompt_text)
89
+ - self.window_service.max_request_length
90
+ ) > 0:
91
+ truncated_ehr_tokens = truncated_ehr_tokens[num_extra_tokens:]
92
+ ehr_truncated = self.window_service.decode(truncated_ehr_tokens)
93
+ truncated_prompt_text = prompt_template.format(question=question, ehr=ehr_truncated)
94
+ num_truncations += 1
95
+
96
+ # naively construct the full non-truncated prompt
97
+ prompt = Prompt(
98
+ global_prefix=self.adapter_spec.global_prefix,
99
+ global_suffix=self.adapter_spec.global_suffix,
100
+ instance_prefix=self.adapter_spec.instance_prefix,
101
+ substitutions=self.adapter_spec.substitutions,
102
+ instructions_block=self.adapter_spec.instructions,
103
+ train_instance_blocks=[],
104
+ eval_instance_block=full_prompt_text,
105
+ truncated_text=truncated_prompt_text,
106
+ )
107
+
108
+ return prompt
@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
5
  from helm.benchmark.scenarios.scenario import Instance
6
6
  from helm.common.request import Request
7
- from .in_context_learning_adapter import InContextLearningAdapter
7
+ from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
8
8
 
9
9
 
10
10
  class GenerationAdapter(InContextLearningAdapter):
@@ -11,7 +11,7 @@ from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS
11
11
  from helm.common.general import parallel_map
12
12
  from helm.common.request import Request
13
13
  from helm.common.hierarchical_logger import hlog, htrack, htrack_block
14
- from .adapter import Adapter
14
+ from helm.benchmark.adaptation.adapters.adapter import Adapter
15
15
 
16
16
 
17
17
  class InContextLearningAdapter(Adapter, ABC):
@@ -7,7 +7,7 @@ from helm.common.general import flatten_list, parallel_map
7
7
  from helm.common.hierarchical_logger import hlog, htrack
8
8
  from helm.common.request import Request
9
9
  from helm.common.tokenization_request import TokenizationToken
10
- from .adapter import Adapter
10
+ from helm.benchmark.adaptation.adapters.adapter import Adapter
11
11
 
12
12
 
13
13
  class LanguageModelingAdapter(Adapter):
@@ -3,8 +3,10 @@ from typing import List
3
3
  from helm.benchmark.adaptation.request_state import RequestState
4
4
  from helm.benchmark.scenarios.scenario import Instance
5
5
  from helm.common.request import Request
6
- from .in_context_learning_multimodal_adapter import InContextLearningMultimodalAdapter
7
- from .multimodal_prompt import MultimodalPrompt
6
+ from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
7
+ InContextLearningMultimodalAdapter,
8
+ )
9
+ from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
8
10
 
9
11
 
10
12
  class GenerationMultimodalAdapter(InContextLearningMultimodalAdapter):
@@ -8,7 +8,7 @@ from helm.common.hierarchical_logger import hlog
8
8
  from helm.common.media_object import MediaObject, MultimediaObject
9
9
  from helm.common.request import Request
10
10
  from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
11
- from .multimodal_prompt import MultimodalPrompt
11
+ from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
12
12
 
13
13
 
14
14
  class InContextLearningMultimodalAdapter(InContextLearningAdapter, ABC):
@@ -8,7 +8,7 @@ from helm.common.request import Request
8
8
  from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
9
9
  InContextLearningMultimodalAdapter,
10
10
  )
11
- from .multimodal_prompt import MultimodalPrompt
11
+ from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
12
12
 
13
13
 
14
14
  class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, ABC):
@@ -8,8 +8,10 @@ from helm.benchmark.scenarios.scenario import Instance, Reference, Input, Output
8
8
  from helm.benchmark.window_services.test_utils import get_tokenizer_service
9
9
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
10
10
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL, AdapterFactory
11
- from .in_context_learning_multimodal_adapter import InContextLearningMultimodalAdapter
12
- from .multimodal_prompt import MultimodalPrompt
11
+ from helm.benchmark.adaptation.adapters.multimodal.in_context_learning_multimodal_adapter import (
12
+ InContextLearningMultimodalAdapter,
13
+ )
14
+ from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
13
15
 
14
16
 
15
17
  class TestInContextLearningMultimodalAdapter(unittest.TestCase):
@@ -2,7 +2,7 @@ from typing import List
2
2
  import unittest
3
3
 
4
4
  from helm.common.media_object import MediaObject, MultimediaObject
5
- from .multimodal_prompt import MultimodalPrompt
5
+ from helm.benchmark.adaptation.adapters.multimodal.multimodal_prompt import MultimodalPrompt
6
6
 
7
7
 
8
8
  class TestMultimodalContent(unittest.TestCase):
@@ -3,7 +3,7 @@ from typing import List
3
3
 
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
5
  from helm.benchmark.scenarios.scenario import Instance, Input
6
- from .multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
6
+ from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
7
7
 
8
8
 
9
9
  class MultipleChoiceCalibratedAdapter(MultipleChoiceSeparateAdapter):
@@ -3,7 +3,7 @@ from typing import List, Dict, Optional
3
3
  from helm.benchmark.adaptation.request_state import RequestState
4
4
  from helm.benchmark.scenarios.scenario import Instance
5
5
  from helm.common.request import Request
6
- from .in_context_learning_adapter import InContextLearningAdapter
6
+ from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
7
7
 
8
8
 
9
9
  class MultipleChoiceJointAdapter(InContextLearningAdapter):
@@ -40,7 +40,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
40
40
 
41
41
  @staticmethod
42
42
  def get_prefix_char(prefix: str) -> str:
43
- return prefix.lstrip()[0]
43
+ return [char for char in prefix if char.isalnum()][0]
44
44
 
45
45
  @staticmethod
46
46
  def get_reference_prefix(prefix: str, i: int) -> str:
@@ -0,0 +1,87 @@
1
+ from typing import Optional
2
+
3
+ from helm.benchmark.scenarios.scenario import Instance
4
+ from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
5
+
6
+
7
+ class MultipleChoiceJointChainOfThoughtAdapter(MultipleChoiceJointAdapter):
8
+ """
9
+ Each `Instance` in a `Scenario` looks like this:
10
+
11
+ <input> -> <reference1>
12
+ <reference2>
13
+ <reference3> [correct]
14
+ <reference4>
15
+
16
+ <instance_chain_of_thought>
17
+
18
+ We can define a label (e.g., letter) for each reference:
19
+
20
+ <global_prefix>
21
+ <instructions>
22
+ <input_prefix>
23
+ <input> # train
24
+ <input_suffix>
25
+ A. <reference1>
26
+ B. <reference2>
27
+ C. <reference3>
28
+ D. <reference4>
29
+ <output_prefix>
30
+ <chain_of_thought_prefix>
31
+ <instance_chain_of_thought>
32
+ <chain_of_thought_suffix>
33
+ <output>
34
+ <output_suffix>
35
+
36
+ <input_prefix>
37
+ <input> # test
38
+ <input_suffix>
39
+ A. <reference1>
40
+ B. <reference2>
41
+ C. <reference3>
42
+ D. <reference4>
43
+ <output_prefix>
44
+ <chain_of_thought_prefix>
45
+ <instance_chain_of_thought>
46
+ <chain_of_thought_suffix>
47
+ <output>
48
+ <output_suffix>
49
+ <global_suffix>
50
+
51
+ In general, each example is:
52
+
53
+ <input_prefix><input><input_suffix><reference_prefixes[index]><reference> \
54
+ <output_prefix><chain_of_thought_prefix><chain_of_thought><chain_of_thought_suffix><output><output_suffix>
55
+ """
56
+
57
+ def construct_example_prompt(self, instance: Instance, include_output: bool, reference_index: Optional[int]) -> str:
58
+ """Return a list of lines corresponding to this example (part of the prompt)."""
59
+ # Input
60
+ result: str = self.adapter_spec.input_prefix + instance.input.text + self.adapter_spec.input_suffix
61
+
62
+ # Include the references
63
+ delimiter = ", "
64
+ no_correct_references = "n/a"
65
+ output = no_correct_references
66
+ for reference_index, reference in enumerate(instance.references):
67
+ prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
68
+ result += prefix + reference.output.text + self.adapter_spec.reference_suffix
69
+ if reference.is_correct:
70
+ if output == no_correct_references:
71
+ output = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
72
+ elif self.adapter_spec.multi_label:
73
+ output += delimiter
74
+ output += self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
75
+
76
+ if include_output:
77
+ chain_of_thought = instance.extra_data.get("chain_of_thought", "") if instance.extra_data else ""
78
+ chain_of_thought_block = (
79
+ self.adapter_spec.chain_of_thought_prefix + chain_of_thought + self.adapter_spec.chain_of_thought_suffix
80
+ )
81
+ result += (
82
+ self.adapter_spec.output_prefix + chain_of_thought_block + output + self.adapter_spec.output_suffix
83
+ )
84
+ else:
85
+ result += self.adapter_spec.output_prefix.rstrip()
86
+
87
+ return result
@@ -4,7 +4,7 @@ from helm.benchmark.adaptation.prompt import Prompt
4
4
  from helm.benchmark.adaptation.request_state import RequestState
5
5
  from helm.benchmark.scenarios.scenario import Instance
6
6
  from helm.common.request import Request
7
- from .in_context_learning_adapter import InContextLearningAdapter
7
+ from helm.benchmark.adaptation.adapters.in_context_learning_adapter import InContextLearningAdapter
8
8
 
9
9
 
10
10
  class MultipleChoiceSeparateAdapter(InContextLearningAdapter):
@@ -14,9 +14,9 @@ from helm.benchmark.scenarios.scenario import (
14
14
  from helm.benchmark.run_specs.simple_run_specs import get_simple1_spec
15
15
  from helm.benchmark.adaptation.prompt import Prompt
16
16
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
17
- from .adapter_factory import AdapterFactory, ADAPT_GENERATION
18
- from .generation_adapter import GenerationAdapter
19
- from .test_adapter import TestAdapter
17
+ from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_GENERATION
18
+ from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
19
+ from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
20
20
 
21
21
 
22
22
  class TestGenerationAdapter(TestAdapter):
@@ -5,8 +5,8 @@ from helm.common.tokenization_request import TokenizationToken
5
5
  from helm.benchmark.adaptation.request_state import RequestState
6
6
  from helm.common.request import Request
7
7
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
8
- from .adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
9
- from .test_adapter import TestAdapter
8
+ from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_LANGUAGE_MODELING
9
+ from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
10
10
  from helm.benchmark.scenarios.scenario import TEST_SPLIT, Instance, Input, Reference
11
11
 
12
12
 
@@ -2,8 +2,8 @@
2
2
  from typing import List, Set
3
3
  from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
4
4
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
5
- from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
6
- from .test_adapter import TestAdapter
5
+ from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
6
+ from helm.benchmark.adaptation.adapters.test_adapter import TestAdapter
7
7
 
8
8
 
9
9
  def _make_instance(
@@ -4,6 +4,7 @@ from helm.benchmark.adaptation.adapter_spec import (
4
4
  ADAPT_GENERATION,
5
5
  ADAPT_LANGUAGE_MODELING,
6
6
  ADAPT_MULTIPLE_CHOICE_JOINT,
7
+ ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
7
8
  ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
8
9
  ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
9
10
  ADAPT_RANKING_BINARY,
@@ -43,13 +44,66 @@ def get_multiple_choice_joint_adapter_spec(
43
44
  [output_noun]:
44
45
  """
45
46
 
47
+ input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
48
+ input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
49
+ output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
50
+ output_suffix = kwargs.pop("output_suffix", "\n")
51
+
46
52
  return AdapterSpec(
47
53
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
48
54
  instructions=format_instructions(instructions),
49
- input_prefix=f"{input_noun}: " if input_noun is not None else "",
50
- input_suffix="\n" if input_noun is not None else "",
51
- output_prefix=f"{output_noun}: ",
52
- output_suffix="\n",
55
+ input_prefix=input_prefix,
56
+ input_suffix=input_suffix,
57
+ output_prefix=output_prefix,
58
+ output_suffix=output_suffix,
59
+ max_train_instances=max_train_instances,
60
+ num_outputs=num_outputs,
61
+ max_tokens=max_tokens,
62
+ temperature=0.0,
63
+ stop_sequences=["\n"],
64
+ sample_train=sample_train,
65
+ **kwargs,
66
+ )
67
+
68
+
69
+ def get_multiple_choice_joint_chain_of_thought_adapter_spec(
70
+ instructions: str,
71
+ input_noun: Optional[str],
72
+ output_noun: str,
73
+ num_outputs: int = 5,
74
+ max_train_instances: int = 5,
75
+ max_tokens: int = 5,
76
+ sample_train: bool = True,
77
+ **kwargs,
78
+ ) -> AdapterSpec:
79
+ """
80
+ [instructions]
81
+
82
+ [input_noun]: [input]
83
+ [reference_1]
84
+ ...
85
+ [reference_k]
86
+ [output_noun]: [output]
87
+
88
+ [input_noun]: [input]
89
+ [reference_1]
90
+ ...
91
+ [reference_k]
92
+ [output_noun]:
93
+ """
94
+
95
+ input_prefix = kwargs.pop("input_prefix", f"{input_noun}: " if input_noun is not None else "")
96
+ input_suffix = kwargs.pop("input_suffix", "\n" if input_noun is not None else "")
97
+ output_prefix = kwargs.pop("output_prefix", f"{output_noun}: ")
98
+ output_suffix = kwargs.pop("output_suffix", "\n")
99
+
100
+ return AdapterSpec(
101
+ method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
102
+ instructions=format_instructions(instructions),
103
+ input_prefix=input_prefix,
104
+ input_suffix=input_suffix,
105
+ output_prefix=output_prefix,
106
+ output_suffix=output_suffix,
53
107
  max_train_instances=max_train_instances,
54
108
  num_outputs=num_outputs,
55
109
  max_tokens=max_tokens,
@@ -109,6 +163,17 @@ def get_multiple_choice_adapter_spec(
109
163
  sample_train=sample_train,
110
164
  **kwargs,
111
165
  )
166
+ elif method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
167
+ return get_multiple_choice_joint_chain_of_thought_adapter_spec(
168
+ instructions,
169
+ input_noun,
170
+ output_noun,
171
+ max_train_instances=max_train_instances,
172
+ num_outputs=num_outputs,
173
+ max_tokens=max_tokens,
174
+ sample_train=sample_train,
175
+ **kwargs,
176
+ )
112
177
  elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
113
178
  return get_multiple_choice_separate_adapter_spec(method, empty_input)
114
179
  else:
@@ -2,7 +2,7 @@ import re
2
2
  from dataclasses import dataclass
3
3
  from typing import List, Optional
4
4
 
5
- from .adapter_spec import Substitution
5
+ from helm.benchmark.adaptation.adapter_spec import Substitution
6
6
 
7
7
 
8
8
  @dataclass(frozen=True)