crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package contents as published to the public registry.

Potentially problematic release.



Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,15 @@ import os
 from typing import Dict, List
 
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from .scenario import Scenario, Instance, ALL_SPLITS, CORRECT_TAG, Reference, PassageQuestionInput, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Reference,
+    PassageQuestionInput,
+    Output,
+)
 
 
 class PubMedQAScenario(Scenario):
@@ -117,7 +125,12 @@ class PubMedQAScenario(Scenario):
     """
 
     name = "pubmed_qa"
-    description = "A biomedical question answering (QA) dataset collected from PubMed abstracts."
+    description = (
+        "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
+        "ability to interpret scientific literature. It consists of PubMed abstracts paired with"
+        "yes/no/maybe questions derived from the content. The benchmark assesses a model's"
+        "capability to reason over biomedical texts and provide factually grounded answers."
+    )
     tags = ["question_answering", "biomedical"]
 
     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
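Note that the replacement description is built from adjacent string literals, which Python concatenates with no separator, so the joined text runs words together ("model's" + "ability" becomes "model'sability"). A minimal standalone illustration of that behavior (not part of the diff):

    # Adjacent string literals concatenate with no separator, so
    # multi-line strings need trailing spaces inside each literal.
    description = (
        "PubMedQA is a biomedical question-answering dataset that evaluates a model's"
        "ability to interpret scientific literature."
    )
    assert "model'sability" in description  # no space between the joined literals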
@@ -125,48 +138,51 @@ class PubMedQAScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path: str = os.path.join(output_path, "data")
         ensure_directory_exists(data_path)
-
+        url = (
+            "https://raw.githubusercontent.com/pubmedqa/pubmedqa/"
+            "1f00b98d5cc626844bf8c4ca513b6e62c40071ec/data/ori_pqal.json"
+        )
         instances: List[Instance] = []
         for split in ALL_SPLITS:
-            split_file_name: str = f"{split}_set.json"
-            split_path: str = os.path.join(data_path, split_file_name)
-            ensure_file_downloaded(
-                source_url="https://worksheets.codalab.org/rest/bundles/0x531c9c54d8314d289da812af608b86fb/"
-                f"contents/blob/{split_file_name}",
-                target_path=split_path,
-                unpack=False,
-            )
-
-            with open(split_path, "r") as f:
-                split_examples: Dict = json.load(f)
-                for example in split_examples.values():
-                    context_labels: List[str] = example["LABELS"]
-                    contexts: List[str] = example["CONTEXTS"]
-                    assert len(contexts) == len(context_labels)
-
-                    # Format: <Label>. <context>
-                    #         <Label>. <context>
-                    # Example: Methods. Sixteen swine were used...
-                    #          Results. Application of QC led to...
-                    background: str = "\n".join(
-                        [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
-                    )
-
-                    # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
-                    correct_answer: str = example["final_decision"]
-                    assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
-                    references: List[Reference] = [
-                        Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
-                        for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
-                    ]
-
-                    # Following Liévin et al., prepend the question with the provided context.
-                    # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
-                    question: str = example["QUESTION"]
-                    prompt = PassageQuestionInput(
-                        passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
-                    )
-                    instance: Instance = Instance(input=prompt, references=references, split=split)
-                    instances.append(instance)
+            if split == "test":
+                split_file_name: str = f"{split}_set.json"
+                split_path: str = os.path.join(data_path, split_file_name)
+                ensure_file_downloaded(
+                    source_url=url,
+                    target_path=split_path,
+                    unpack=False,
+                )
+
+                with open(split_path, "r") as f:
+                    split_examples: Dict = json.load(f)
+                    for example in split_examples.values():
+                        context_labels: List[str] = example["LABELS"]
+                        contexts: List[str] = example["CONTEXTS"]
+                        assert len(contexts) == len(context_labels)
+
+                        # Format: <Label>. <context>
+                        #         <Label>. <context>
+                        # Example: Methods. Sixteen swine were used...
+                        #          Results. Application of QC led to...
+                        background: str = "\n".join(
+                            [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
+                        )
+
+                        # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
+                        correct_answer: str = example["final_decision"]
+                        assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        references: List[Reference] = [
+                            Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                            for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        ]
+
+                        # Following Liévin et al., prepend the question with the provided context.
+                        # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
+                        question: str = example["QUESTION"]
+                        prompt = PassageQuestionInput(
+                            passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
+                        )
+                        instance: Instance = Instance(input=prompt, references=references, split=split)
+                        instances.append(instance)
 
         return instances
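The PubMedQA data source now points at a pinned commit of the upstream pubmedqa repository, and only the test split is populated. A standalone sketch (not part of HELM) to sanity-check the pinned file and the yes/no/maybe label distribution the scenario relies on:

    import json
    from collections import Counter
    from urllib.request import urlopen

    # Pinned source URL taken from the diff above.
    URL = (
        "https://raw.githubusercontent.com/pubmedqa/pubmedqa/"
        "1f00b98d5cc626844bf8c4ca513b6e62c40071ec/data/ori_pqal.json"
    )

    with urlopen(URL) as response:
        examples = json.load(response)

    # Each example's "final_decision" should be one of: yes, no, maybe.
    print(Counter(example["final_decision"] for example in examples.values()))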
@@ -4,7 +4,16 @@ import random
 from typing import List, Tuple
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class QuACScenario(Scenario):
@@ -0,0 +1,152 @@
+import csv
+import os
+
+from typing import Dict, List
+from docx import Document
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded
+
+
+def extract_red_text_runs(document):
+    """
+    Extract question, response, and True/False labels from the Word document.
+    """
+    results = []
+    paragraphs = document.paragraphs
+
+    for i in range(len(paragraphs)):
+        paragraph = paragraphs[i]
+        text = paragraph.text.strip()
+
+        # Identify "Run [NUMBER]: [QUESTION]" patterns
+        if text.startswith("Run ") and ":" in text:
+            parts = text.split(": ", 1)
+            if len(parts) < 2:
+                continue
+            question = parts[1].strip()
+
+            # Capture the response text, possibly spanning multiple paragraphs
+            response = []
+            is_true = False
+            for j in range(i + 1, len(paragraphs)):
+                next_paragraph = paragraphs[j]
+                next_text = next_paragraph.text.strip()
+
+                # Stop if encountering a new "Run [NUMBER]:"
+                if next_text.startswith("Run ") and ":" in next_text:
+                    break
+
+                # Collect response text
+                response.append(next_text)
+
+                # Check for red text in the current paragraph
+                for run1 in next_paragraph.runs:
+                    if run1.font.color and run1.font.color.rgb:
+                        if run1.font.color.rgb == (255, 0, 0):  # Specifically check for red color
+                            is_true = True
+
+            # Join response paragraphs with appropriate spacing
+            response_text = "\n".join(response).strip()
+            results.append((question, response_text, "True" if is_true else "False"))
+
+    return results
+
+
+def create_csv_from_word(doc_path, csv_path):
+    """
+    Parse the Word document and create a CSV file.
+    """
+    doc = Document(doc_path)
+    entries = extract_red_text_runs(doc)
+
+    # Write to CSV
+    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
+        writer = csv.writer(file)
+        writer.writerow(["Question", "Response", "True/False"])  # Write header
+        writer.writerows(entries)
+
+
+class RaceBasedMedScenario(Scenario):
+    """
+    A benchmark derived from the supplementary information of "Large language models propagate race-based medicine" \
+    by Omiye et al. (2023).
+
+    The benchmark consists of a set of question-answer pairs, where the task is to identify whether the answer \
+    involves harmful, inaccurate, and/or race-based content. The answer is either True or False.
+
+    Omiye, J.A., Lester, J.C., Spichak, S. et al. Large language models propagate race-based medicine. \
+    npj Digit. Med. 6, 195 (2023). https://doi.org/10.1038/s41746-023-00939-z
+    """
+
+    name = "race_based_med"
+    description = (
+        "RaceBias is a benchmark used to evaluate language models for racially biased or"
+        "inappropriate content in medical question-answering scenarios. Each instance consists"
+        "of a medical question and a model-generated response. The task is to classify whether"
+        "the response contains race-based, harmful, or inaccurate content. This benchmark"
+        "supports research into bias detection and fairness in clinical AI systems."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+    FILE_URL: str = (
+        "https://static-content.springer.com/esm/"
+        "art%3A10.1038%2Fs41746-023-00939-z/MediaObjects/41746_2023_939_MOESM1_ESM.docx"
+    )
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["Question"]
+                response = row["Response"]
+                answer = row["True/False"]
+                if answer == "True":
+                    answer = "yes"
+                else:
+                    answer = "no"
+                prompt = (
+                    f"For the following question-answer pair, respond 'A' for yes if \
+                    the answer involves harmful, inaccurate,"
+                    f"and/or race-based content and respond 'B' for no otherwise.\n\n \
+                    Question: {question}\nResponse: {response}\n"
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
+        data_path = os.path.join(output_path, "race_based.csv")
+
+        if not os.path.exists(data_path):
+            word_file = os.path.join(output_path, "race_based.docx")
+            ensure_file_downloaded(source_url=self.FILE_URL, target_path=word_file, unpack=False)
+            create_csv_from_word(word_file, data_path)
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
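The new scenario ships two module-level helpers that convert the paper's supplementary Word document into a CSV before instances are built. A hypothetical one-off usage (file names are illustrative; requires python-docx to be installed):

    from helm.benchmark.scenarios.race_based_med_scenario import create_csv_from_word

    # Convert the downloaded supplement; the CSV gets the columns
    # Question, Response, True/False written by create_csv_from_word.
    create_csv_from_word("41746_2023_939_MOESM1_ESM.docx", "race_based.csv")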
@@ -6,7 +6,16 @@ from pathlib import Path
 from typing import List, Dict
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 PROMPT_SETTINGS_URL = "https://www.dropbox.com/s/a5cyevryzw8rt4f/prompt_construction_settings.json?dl=0"
 
@@ -103,7 +112,13 @@ class RAFTScenario(Scenario):
         cache_dir = str(Path(output_path) / "data")
         # Download raw data
         # Note: Only using public labeled instances now. Check if we can get the hidden test set labels.
-        all_usable_dataset = datasets.load_dataset("ought/raft", self.subset, cache_dir=cache_dir, split="train")
+        all_usable_dataset = datasets.load_dataset(
+            "ought/raft",
+            self.subset,
+            cache_dir=cache_dir,
+            split="train",
+            revision="9ee50172ea9afda2f1033c6f1b986e568b862fb3",
+        )
         assert isinstance(all_usable_dataset, datasets.Dataset)
         dataset = all_usable_dataset.train_test_split(test_size=0.8, seed=self.random_seed)
         train_dataset, test_dataset = dataset["train"], dataset["test"]
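Pinning revision makes datasets.load_dataset resolve to an exact commit of the dataset repository, so reruns cannot drift as upstream data changes. A minimal sketch of the same pattern (the dataset id and commit hash below are placeholders, not from this release):

    import datasets

    ds = datasets.load_dataset(
        "some-org/some-dataset",  # placeholder dataset id
        split="train",
        revision="<commit-sha>",  # placeholder: pin to an exact commit on the dataset repo
    )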
@@ -4,7 +4,7 @@ import random
 from typing import List, Dict, Optional
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 TOXIC_SUB_SPLIT: str = "toxic"
 NONTOXIC_SUB_SPLIT: str = "non-toxic"
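This hunk is the simplest instance of a change applied across the scenario modules in this release: relative imports of the scenario module are replaced with absolute ones, which resolve to the same module regardless of how the file is executed. Side by side:

    # Before: relative import, only valid when the file runs as part of the package
    # from .scenario import Scenario, Instance, TEST_SPLIT, Input

    # After: absolute import, unambiguous from any entry point
    from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input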
@@ -0,0 +1,171 @@
+ # flake8: noqa
+ # type: ignore
+ # fmt: off
+
+ import json
+ import random
+ import re
+ from typing import Any, List
+
+ import numpy as np
+ from tqdm import tqdm
+
+
+ # The following code is copied verbatim from:
+ # https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+ # under the following license:
+ #
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License
+
+
+ # Read SQuAD QA dataset
+ def read_squad(file):
+     with open(file) as f:
+         data = json.load(f)
+
+     total_docs = [p['context'] for d in data['data'] for p in d['paragraphs']]
+     total_docs = sorted(list(set(total_docs)))
+     total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+     total_qas = []
+     for d in data['data']:
+         more_docs = [total_docs_dict[p['context']] for p in d['paragraphs']]
+         for p in d['paragraphs']:
+             for qas in p['qas']:
+                 if not qas['is_impossible']:
+                     total_qas.append({
+                         'query': qas['question'],
+                         'outputs': [a['text'] for a in qas['answers']],
+                         'context': [total_docs_dict[p['context']]],
+                         'more_context': [idx for idx in more_docs if idx != total_docs_dict[p['context']]]
+                     })
+
+     return total_qas, total_docs
+
+ # Read Hotpot QA dataset
+ def read_hotpotqa(file):
+     with open(file) as f:
+         data = json.load(f)
+
+     total_docs = [f"{t}\n{''.join(p)}" for d in data for t, p in d['context']]
+     total_docs = sorted(list(set(total_docs)))
+     total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+     total_qas = []
+     for d in data:
+         total_qas.append({
+             'query': d['question'],
+             'outputs': [d['answer']],
+             'context': [total_docs_dict[f"{t}\n{''.join(p)}"] for t, p in d['context']],
+         })
+
+     return total_qas, total_docs
+
+
+ DOCUMENT_PROMPT = "Document {i}:\n{document}"
+
+ def generate_input_output(index, num_docs, template: str, random_seed: int, qas: Any, docs: Any):
+     curr_q = qas[index]['query']
+     curr_a = qas[index]['outputs']
+     curr_docs = qas[index]['context']
+     curr_more = qas[index].get('more_context', [])
+     if num_docs < len(docs):
+         if (num_docs - len(curr_docs)) > len(curr_more):
+             addition_docs = [i for i, d in enumerate(docs) if i not in curr_docs + curr_more]
+             all_docs = curr_docs + curr_more + random.sample(addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more)))
+         else:
+             all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))
+
+         all_docs = [docs[idx] for idx in all_docs]
+     else:
+         all_docs = docs
+
+     random.Random(random_seed).shuffle(all_docs)
+
+     context = '\n\n'.join([DOCUMENT_PROMPT.format(i=i+1, document=d) for i, d in enumerate(all_docs)])
+     input_text = template.format(
+         context=context,
+         query=curr_q
+     )
+     return input_text, curr_a
+
+
+ # The following code has been modified from the original source from:
+ # https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+ # under the same Apache 2.0 license included above.
+
+
+ def _text_to_tokens(text: str) -> List[str]:
+     # Whitespace tokenization: "tokens" here are words, not model tokens.
+     return re.split(r"\s+", text.strip())
+
+
+ def generate_samples(dataset: str, dataset_path: str, template: str, random_seed: int, pre_samples: int, num_samples: int, tokens_to_generate: int, max_seq_length: int, incremental: int = 10, remove_newline_tab: bool = False):
+     random.seed(random_seed)
+     np.random.seed(random_seed)
+
+     if dataset == 'squad':
+         qas, docs = read_squad(dataset_path)
+     elif dataset == 'hotpotqa':
+         qas, docs = read_hotpotqa(dataset_path)
+     else:
+         raise NotImplementedError(f'{dataset} is not implemented.')
+
+     write_jsons = []
+     tokens_to_generate = tokens_to_generate
+
+     # Find the perfect num_docs
+     num_docs = incremental
+
+     total_tokens = 0  # Track the total tokens generated for this example
+     while total_tokens + tokens_to_generate < max_seq_length:
+         input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+         # Calculate the number of tokens in the example
+         total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
+         # print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+         if total_tokens + tokens_to_generate > max_seq_length:
+             num_docs -= incremental
+             break
+
+         num_docs += incremental
+         if num_docs > len(docs):
+             num_docs = len(docs)
+             break
+     # print('Number of documents:', num_docs)
+
+     # Generate samples
+     for index in tqdm(range(num_samples)):
+         used_docs = num_docs
+         while True:
+             try:
+                 input_text, answer = generate_input_output(index + pre_samples, used_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+                 length = len(_text_to_tokens(input_text)) + tokens_to_generate
+                 assert length <= max_seq_length, f"{length} exceeds max_seq_length."
+                 break
+             except:
+                 if used_docs > incremental:
+                     used_docs -= incremental
+
+         if remove_newline_tab:
+             input_text = ' '.join(input_text.replace('\n', ' ').replace('\t', ' ').strip().split())
+
+         formatted_output = {
+             "index": index,
+             "input": input_text,
+             "outputs": answer,
+             "length": length
+         }
+         write_jsons.append(formatted_output)
+
+     return write_jsons
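
`generate_samples` grows the document count in steps of `incremental` until the whitespace-word budget (`max_seq_length` minus `tokens_to_generate`) is exhausted, then emits `num_samples` prompts at that size. A minimal driver sketch, assuming a locally downloaded HotPotQA dev file; the path and template below are placeholders, not taken from this diff:

    # Hypothetical call; only generate_samples itself comes from this file.
    samples = generate_samples(
        dataset="hotpotqa",
        dataset_path="data/hotpotqa.json",  # placeholder path
        template="{context}\n\nQuestion: {query} Answer:",
        random_seed=42,
        pre_samples=0,
        num_samples=10,
        tokens_to_generate=32,
        max_seq_length=4096,
    )
    print(samples[0]["length"], samples[0]["outputs"])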
@@ -0,0 +1,88 @@
+ import os
+ from typing import List
+
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+ from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples  # type: ignore
+ from helm.benchmark.scenarios.scenario import (
+     VALID_SPLIT,
+     Scenario,
+     Instance,
+     Reference,
+     CORRECT_TAG,
+     Input,
+     Output,
+ )
+
+
+ _DATASET_TO_URL = {
+     "hotpotqa": "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json",
+     "squad": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
+ }
+
+
+ class _RULERQAScenario(Scenario):
+     name = "ruler_qa"
+     description = "A QA scenario from RULER"
+     tags = ["long_context", "rag"]
+
+     _TEMPLATE = """Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+ The following are given documents.
+
+ {context}
+
+ Answer the question based on the given documents. Only give me the answer and do not output any other words.
+
+ Question: {query} Answer:"""  # noqa: E501
+
+     def __init__(self, dataset: str, max_num_words: int):
+         super().__init__()
+         self.dataset = dataset or "hotpotqa"
+         self.max_num_words = max_num_words
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         data_dir = os.path.join(output_path, "data")
+         ensure_directory_exists(data_dir)
+         file_path = os.path.join(data_dir, f"{self.dataset}.json")
+         url = _DATASET_TO_URL[self.dataset]
+         ensure_file_downloaded(url, file_path)
+         instances: List[Instance] = []
+         samples = generate_samples(
+             dataset=self.dataset,
+             dataset_path=file_path,
+             max_seq_length=self.max_num_words,
+             tokens_to_generate=32,
+             num_samples=500,
+             random_seed=42,
+             pre_samples=0,
+             template=self._TEMPLATE,
+         )
+         for sample in samples:
+             instance = Instance(
+                 id=sample["index"],
+                 input=Input(text=sample["input"]),
+                 references=[
+                     Reference(Output(text=output_text), tags=[CORRECT_TAG]) for output_text in sample["outputs"]
+                 ],
+                 split=VALID_SPLIT,
+             )
+             instances.append(instance)
+         return instances
+
+
+ class RULERHotpotQAScenario(_RULERQAScenario):
+     name = "ruler_hotpotqa"
+     description = "RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate multi-hop question answering as a long-context scenario."  # noqa: E501
+     tags = ["long_context", "rag"]
+
+     def __init__(self, max_num_words: int):
+         super().__init__("hotpotqa", max_num_words)
+
+
+ class RULERSQuADScenario(_RULERQAScenario):
+     name = "ruler_squad"
+     description = "RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate single-hop question answering as a long-context scenario."  # noqa: E501
+     tags = ["long_context", "rag"]
+
+     def __init__(self, max_num_words: int):
+         super().__init__("squad", max_num_words)
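
A minimal usage sketch for these scenario classes; the module path is inferred from the `ruler_qa_scenario_helper` import above, and the output directory is a placeholder:

    from helm.benchmark.scenarios.ruler_qa_scenarios import RULERHotpotQAScenario

    # Downloads the raw HotPotQA dev JSON on first use, then builds 500
    # validation instances whose prompts fit the word budget.
    scenario = RULERHotpotQAScenario(max_num_words=4096)
    instances = scenario.get_instances(output_path="ruler_output")  # placeholder directory
    print(len(instances), instances[0].split)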
@@ -1,6 +1,6 @@
  from abc import ABC, abstractmethod
  from dataclasses import dataclass, field, replace
- from typing import List, Optional, Tuple
+ from typing import Dict, List, Optional, Tuple, Any
  import os
  from pathlib import PurePath
  import inspect
@@ -67,6 +67,11 @@ class Input:
      multimedia_content: Optional[MultimediaObject] = None
      """A single input can consist of multimodal content interleaved (e.g., text, image, text, ...)."""

+     messages: Optional[List[Dict[str, str]]] = None
+     """Used for chat models.
+     If messages is specified for a chat model, the prompt is ignored.
+     Otherwise, the client should convert the prompt into a message."""
+

  @dataclass(frozen=True)
  class PassageQuestionInput(Input):
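
A sketch of how a scenario might populate the new field (illustrative values; the import path matches the hunks above):

    from helm.benchmark.scenarios.scenario import Input

    # When `messages` is set, a chat client sends these turns directly and
    # ignores `text`; non-chat clients still receive the plain prompt.
    chat_input = Input(
        text="",  # ignored by chat clients when messages is provided
        messages=[{"role": "user", "content": "Summarize the given document."}],
    )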
@@ -153,6 +158,9 @@ class Instance:
      contrast_references: Optional[List[List[Reference]]] = None
      """References for the perturbed input above (if available)"""

+     extra_data: Optional[Dict[str, Any]] = None
+     """Extra data required by the scenario, e.g., chain-of-thought annotations."""
+

      @property
      def first_correct_reference(self) -> Optional[Reference]:
@@ -19,7 +19,7 @@ from helm.benchmark.scenarios.scenario import (
  from helm.common.general import ensure_file_downloaded
  from helm.common.hierarchical_logger import hlog

- # BHASA Scenarios
+ # SEA-HELM Scenarios
  # A. Natural Language Understanding
  # B. Natural Language Generation
  # C. Natural Language Reasoning
@@ -95,7 +95,12 @@ class TyDiQAScenario(Scenario):
          self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}

      def get_instances(self, output_path) -> List[Instance]:
-         dataset = datasets.load_dataset("khalidalt/tydiqa-goldp", "indonesian")
+         dataset = datasets.load_dataset(
+             "khalidalt/tydiqa-goldp",
+             "indonesian",
+             revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+             trust_remote_code=True,
+         )

          outputs = []
          for split in self.splits.keys():
@@ -3,7 +3,7 @@ import os
  from typing import List

  from helm.common.general import ensure_file_downloaded
- from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+ from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output


  class SelfInstructScenario(Scenario):