crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic.


This version of crfm-helm might be problematic; consult the package registry's advisory page for more details.

Files changed (652):
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,277 @@
1
+ import os
2
+ import re
3
+
4
+ from typing import Any, Dict, List, Tuple, Optional
5
+ import xml.etree.ElementTree as ET
6
+
7
+ from helm.common.general import ensure_directory_exists
8
+ from helm.benchmark.scenarios.scenario import (
9
+ TEST_SPLIT,
10
+ TRAIN_SPLIT,
11
+ Input,
12
+ Scenario,
13
+ Instance,
14
+ CORRECT_TAG,
15
+ Reference,
16
+ Output,
17
+ )
18
+
19
# Verbatim inclusion-criterion definitions from the n2c2 2018 (track 1)
# cohort-selection annotation guidelines. Backslash continuations keep each
# definition a single string; this text is emitted verbatim into prompts.
# NOTE(review): continuation lines are kept at column 0 so the strings contain
# no embedded indentation — confirm against the upstream source formatting.
ORIGINAL_DEFINITIONS = {
    "ABDOMINAL": "History of intra-abdominal surgery, small or large intestine resection, or small bowel obstruction",
    "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define “advanced” \
as having 2 or more of the following: • Taking 2 or more medications to treat CAD • \
History of myocardial infarction (MI) • Currently experiencing angina • Ischemia, past or present",
    "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
    "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)",
    "CREATININE": "Serum creatinine level above the upper normal limit",
    "DIETSUPP-2MOS": "Taken a dietary supplement (excluding vitamin D) in the past 2 months",
    "DRUG-ABUSE": "Current or past history of drug abuse",
    "ENGLISH": "Patient must speak English",
    "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
    "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
    "MAJOR-DIABETES": "Major diabetes-related complication. For the purposes of this annotation, we define \
“major complication” (as opposed to “minor complication”) as any of the following that are a result of \
(or strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • \
Retinopathy • nephropathy • neuropathy",
    "MAKES-DECISIONS": "Patient must make their own medical decisions",
    "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
}
39
# Custom definitions for better prompts
# (Rewritten versions of ORIGINAL_DEFINITIONS with extra decision guidance for
# the LLM, per the Wornow et al. (2024) prompting formulation.)
# NOTE(review): "ASP-FOR-MI" ends with a double period and "MAKES-DECISIONS"
# embeds a stray escaped quote at the end; left untouched here because this
# text is injected verbatim into prompts — confirm whether these are intended.
LONG_DEFINITIONS = {
    "ABDOMINAL": "History of intra-abdominal surgery. This could include any form of intra-abdominal surgery, \
including but not limited to small/large intestine resection or small bowel obstruction",
    "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define \
“advanced” as having 2 or more of the following: (a) Taking 2 or more medications to treat CAD (b) History \
of myocardial infarction (MI) (c) Currently experiencing angina (d) Ischemia, past or present. \
The patient must have at least 2 of these categories (a,b,c,d) to meet this criterion, otherwise the patient \
does not meet this criterion. For ADVANCED-CAD, be strict in your evaluation of the patient -- if they just \
have cardiovascular disease, then they do not meet this criterion.",
    "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
    "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)..",
    "CREATININE": "Serum creatinine level above the upper normal limit",
    "DIETSUPP-2MOS": "Consumption of a dietary supplement (excluding vitamin D) in the past 2 months. To assess \
this criterion, go through the list of medications_and_supplements taken from the note. If a substance could \
potentially be used as a dietary supplement (i.e. it is commonly used as a dietary supplement, even if it \
is not explicitly stated as being used as a dietary supplement), then the patient meets this criterion. \
Be lenient and broad in what is considered a dietary supplement. For example, a 'multivitamin' and \
'calcium carbonate' should always be considered a dietary supplement if they are included in this list.",
    "DRUG-ABUSE": "Current or past history of drug abuse",
    "ENGLISH": "Patient speaks English. Assume that the patient speaks English, unless otherwise explicitly noted. \
If the patient's language is not mentioned in the note, then assume they speak English and thus meet \
this criteria.",
    "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
    "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
    "MAJOR-DIABETES": "Major diabetes-related complication. Examples of “major complication” (as opposed to \
“minor complication”) include, but are not limited to, any of the following that are a result of (or \
strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • Retinopathy \
• nephropathy • neuropathy. Additionally, if multiple conditions together imply a severe case of diabetes, \
then count that as a major complication.",
    "MAKES-DECISIONS": "Patient must make their own medical decisions. Assume that the patient makes their own \
medical decisions, unless otherwise explicitly noted. There is no information provided about the \
patient's ability to make their own medical decisions, then assume they do make their own decisions and \
therefore meet this criteria.\"",
    "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
}
75
+
76
+
77
class XMLDataLoader:
    """Loads and parses the n2c2 2018 (track 1) XML patient files.

    Each XML file contains a ``TEXT`` element holding the patient's
    concatenated clinical notes (individual notes separated by a run of 100
    asterisks) and a ``TAGS`` element with one subtag per inclusion
    criterion, whose ``met`` attribute is either ``"met"`` or ``"not met"``.
    """

    def __init__(
        self, path_to_folder: str, is_convert_to_numbers=True, is_split_text=True, is_remove_excessive_new_lines=True
    ):
        self.path_to_folder = path_to_folder
        # If True, map "met"/"not met" attribute values to 1/0 in read_tags().
        self.is_convert_to_numbers = is_convert_to_numbers
        # If True, split the TEXT element into a list of individual notes.
        self.is_split_text = is_split_text
        # If True, collapse literal triple-newline runs in the note text.
        self.is_remove_excessive_new_lines = is_remove_excessive_new_lines

    def load_data(self) -> List[Dict[str, Any]]:
        """Main function: Data loader for the XML files.

        Returns one dict per patient with keys "patient_id" (file name minus
        extension), "ehr" (list of note strings), and "labels" (criterion ->
        met value, see read_tags).
        """
        data = []
        # Sort for a deterministic patient ordering across runs.
        file_names = sorted(file for file in os.listdir(self.path_to_folder) if file.endswith(".xml"))
        for file_name in file_names:
            file_path = os.path.join(self.path_to_folder, file_name)
            text, labels = self.parse_xml(file_path)
            data.append({"patient_id": file_name.replace(".xml", ""), "ehr": text, "labels": labels})

        return data

    @staticmethod
    def get_date_of_note(patient: Dict[str, Any], note_idx: int) -> Optional[str]:
        """Get the "Record date" of the note at `note_idx`, or None if absent."""
        # Bug fix: the bound must be strict — note_idx == len(ehr) previously
        # passed the assert and then raised IndexError on the indexing below.
        assert note_idx < len(patient["ehr"]), f"{note_idx} out of bounds for {patient['patient_id']}"
        note: str = patient["ehr"][note_idx]
        match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
        date = match.group(1) if match else None
        if not date:
            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
        return date

    @staticmethod
    def get_current_date_for_patient(patient: Dict[str, Any]) -> Optional[str]:
        """Get most recent date visible in files for a given patient."""
        most_recent_date = None
        # Keep the date from the last note that contains one; notes are
        # assumed to appear in chronological order.
        for note in patient["ehr"]:
            match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
            most_recent_date = match.group(1) if match else most_recent_date
        if not most_recent_date:
            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
        return most_recent_date

    def parse_xml(self, XML_file) -> Tuple[List[str], Dict[str, str]]:
        """Parse one patient XML file into (list of note texts, criterion tags)."""
        tree = ET.parse(XML_file)
        root = tree.getroot()
        text_content = ""
        result_text: List[str] = []
        tags = {}
        for elem in root.iter():
            if elem.tag == "TEXT":
                text_content = elem.text if elem.text else ""
                if self.is_remove_excessive_new_lines:
                    text_content = self.remove_excessive_newlines(text_content)
                if self.is_split_text:
                    result_text = self.split_text(text_content)
                else:
                    result_text = [text_content]
            elif elem.tag == "TAGS":
                tags = self.read_tags(root)
        return (result_text, tags)

    def read_tags(self, root) -> Dict[str, str]:
        """Reads the tags from an XML file and returns a dictionary of tags"""
        tags_dict = {}
        for tag in root.iter("TAGS"):
            for subtag in tag:
                met_value = subtag.attrib.get("met")
                if self.is_convert_to_numbers:
                    # "met" -> 1, anything else (incl. missing) -> 0.
                    met_value = 1 if met_value == "met" else 0
                tags_dict[subtag.tag] = met_value
        return tags_dict

    def split_text(self, text: str) -> List[str]:
        """Split the raw TEXT blob into individual notes on the 100-asterisk separator."""
        split_char = "*" * 100
        parts = [x.strip() for x in text.split(split_char) if x.strip() != ""]
        return parts

    def remove_excessive_newlines(self, text: str) -> str:
        # NOTE: only literal triple-newline runs are collapsed in one pass;
        # e.g. four consecutive newlines leave a double newline behind.
        text = text.replace("\n\n\n", "\n")
        return text
158
+
159
+
160
class N2C2CTMatchingScenario(Scenario):
    """
    From "Cohort selection for clinical trials: n2c2 2018 shared task track 1" (Stubbs et al. 2019).
    N2C2 is a collection of 288 patients (202 train / 86 test), each with 2-5 deidentified real-world clinical notes.
    We use the prompt LLM formulation from Wornow et al. (2024).

    Citation
    ```
    @article{stubbs2019cohort,
    title={Cohort selection for clinical trials: n2c2 2018 shared task track 1},
    author={Stubbs, Amber and Filannino, Michele and Soysal, Ergin and Henry, Samuel and Uzuner, {\"O}zlem},
    journal={Journal of the American Medical Informatics Association},
    volume={26},
    number={11},
    pages={1163--1171},
    year={2019},
    publisher={Oxford University Press}
    }
    @article{wornow2024zero,
    title={Zero-shot clinical trial patient matching with llms},
    author={Wornow, Michael and Lozano, Alejandro and Dash, Dev and Jindal, Jenelle and Mahaffey, \
Kenneth W and Shah, Nigam H},
    journal={NEJM AI},
    pages={AIcs2400360},
    year={2024},
    publisher={Massachusetts Medical Society}
    }
    ```
    """

    name = "n2c2_ct_matching"
    # Bug fix: the original adjacent string literals had no separating spaces,
    # so the rendered description ran words together ("toappropriate",
    # "clinicalnote", ...). Trailing spaces added to each continued line.
    description = (
        "N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to "
        "appropriate clinical trials based on eligibility criteria. Each example includes a clinical "
        "note and a trial description. The model is tasked with determining whether the patient "
        "is a valid candidate for the trial. This benchmark supports automation and decision "
        "support in clinical research enrollment."
    )
    tags = []  # TODO

    # The task is binary: does the patient meet the inclusion criterion?
    POSSIBLE_ANSWER_CHOICES: List[str] = [
        "yes",
        "no",
    ]

    def __init__(self, data_path: str, subject: str):
        super().__init__()
        self.subject: str = subject  # specific inclusion criterion to assess
        self.data_path: str = data_path
        self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
        self.path_to_test_dir: str = os.path.join(self.data_path, "test/")

    def create_prompt(self, patient: Dict[str, Any]) -> str:
        """Render the Wornow et al. (2024)-style prompt for one patient."""
        # Cast None values to empty strings during string formatting, but keep the original functions returning None
        notes_list = [
            f"## Note #{i+1}\nDate: {XMLDataLoader.get_date_of_note(patient, i) or ''}\n{note}"
            for i, note in enumerate(patient["ehr"])
        ]
        notes: str = ("\n" + "*" * 50 + "\n\n").join(notes_list)
        current_date = XMLDataLoader.get_current_date_for_patient(patient)
        prompt = f"""
# Task
Your job is to decide whether the given patient meets the inclusion criterion for a clinical trial.

# Inclusion Criterion
The inclusion criterion being assessed is: "{self.subject}".
The definition of the inclusion criterion is: "{LONG_DEFINITIONS[self.subject]}".

# Patient Clinical Notes
Below is a set of {len(patient['ehr'])} clinical notes describing the patient's current health status. \
Each note is separated by a header with the date that the note was written, as well as a long list of asterisks.

{'-' * 100}

{notes}

{'-' * 100}

# Current Date
Assume that the current date is: {current_date}

# Question
Does the patient meet the inclusion criterion "{self.subject}"?
"""
        return prompt

    def get_instances(self, output_path: str) -> List[Instance]:
        instances: List[Instance] = []
        for split in ["train", "test"]:
            # limit to zero shot setting
            # NOTE(review): only the "test" branch runs, so no TRAIN_SPLIT
            # instances are ever produced and the train/test ternaries below
            # are effectively dead — confirm this gating is intentional.
            if split == "test":
                path_to_data = self.path_to_train_dir if split == "train" else self.path_to_test_dir
                ensure_directory_exists(path_to_data)

                # Load dataset
                dataloader = XMLDataLoader(path_to_data)
                dataset = dataloader.load_data()

                # Create instances
                for patient in dataset:
                    # Labels are 1/0 ints from XMLDataLoader; truthiness maps them to yes/no.
                    is_met: bool = patient["labels"][self.subject]
                    correct_answer: str = "yes" if is_met else "no"

                    # Build References. The possible answer choices are "yes" or "no"
                    references: List[Reference] = [
                        Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
                        for answer in N2C2CTMatchingScenario.POSSIBLE_ANSWER_CHOICES
                    ]

                    instances.append(
                        Instance(
                            input=Input(text=self.create_prompt(patient)),
                            references=references,
                            split=TRAIN_SPLIT if split == "train" else TEST_SPLIT,
                        )
                    )

        return instances
@@ -4,7 +4,7 @@ import csv
4
4
  from typing import List, Dict
5
5
 
6
6
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists
7
- from .scenario import (
7
+ from helm.benchmark.scenarios.scenario import (
8
8
  Scenario,
9
9
  Instance,
10
10
  Reference,
@@ -10,7 +10,7 @@ from helm.common.hierarchical_logger import htrack_block, hlog
10
10
  from typing import List, Dict
11
11
 
12
12
  from helm.common.general import ensure_file_downloaded, ensure_directory_exists, asdict_without_nones
13
- from .scenario import (
13
+ from helm.benchmark.scenarios.scenario import (
14
14
  Scenario,
15
15
  Instance,
16
16
  Reference,
@@ -3,7 +3,7 @@ import json
3
3
  import random
4
4
  from typing import Dict, List, Tuple
5
5
 
6
- from .scenario import (
6
+ from helm.benchmark.scenarios.scenario import (
7
7
  Scenario,
8
8
  Instance,
9
9
  Reference,
@@ -11,11 +11,21 @@ from typing import List, Optional, Tuple, Dict
11
11
 
12
12
  from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
13
13
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
14
+ from helm.common.local_context import LocalContext
14
15
  from helm.benchmark.window_services.tokenizer_service import TokenizerService
15
16
  from helm.common.authentication import Authentication
16
17
  from helm.common.optional_dependencies import handle_module_not_found_error
17
18
  from helm.proxy.services.server_service import ServerService
18
- from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
19
+ from helm.benchmark.scenarios.scenario import (
20
+ Scenario,
21
+ Instance,
22
+ Reference,
23
+ TRAIN_SPLIT,
24
+ TEST_SPLIT,
25
+ CORRECT_TAG,
26
+ Input,
27
+ Output,
28
+ )
19
29
 
20
30
  try:
21
31
  import sympy
@@ -30,7 +40,7 @@ except ModuleNotFoundError as e:
30
40
  # https://github.com/stanford-crfm/benchmarking/issues/569
31
41
def get_test_tokenizer_service() -> TokenizerService:
    """Build a TokenizerService backed by a LocalContext for use in tests."""
    # Pointed to the default local path set in run.py (--local-path)
    return TokenizerService(LocalContext(base_path="prod_env"))
34
44
 
35
45
 
36
46
  SOLUTION_TAG: str = "solution"
@@ -0,0 +1,57 @@
1
+ from typing import List, Any
2
+ from pathlib import Path
3
+ from datasets import load_dataset
4
+
5
+ from helm.benchmark.scenarios.scenario import (
6
+ Scenario,
7
+ Instance,
8
+ Reference,
9
+ CORRECT_TAG,
10
+ TEST_SPLIT,
11
+ Input,
12
+ Output,
13
+ )
14
+
15
+
16
class OABExamsScenario(Scenario):
    """
    The OAB Exam is a mandatory test for anyone who wants to practice law in Brazil. The exam is composed of
    an objective test with 80 multiple-choice questions covering all areas of Law and a written phase focused
    on a specific legal area (e.g., Civil, Criminal, Labor Law), where candidates must draft a legal document
    and answer four essay questions.

    This dataset is composed of the exams that occurred between 2010 and 2018.

    The dataset can be found in this link: https://huggingface.co/datasets/eduagarcia/oab_exams
    """

    name = "oab_exams"
    description = "OAB exams dataset"
    tags = ["knowledge", "multiple_choice", "pt-br"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the OAB exams dataset and convert each question into a multiple-choice Instance.

        Nullified (canceled) questions are skipped. Every answer choice becomes
        a Reference; the one matching the answer key is tagged CORRECT_TAG.
        """
        cache_dir = str(Path(output_path) / "data")
        dataset = load_dataset("eduagarcia/oab_exams", cache_dir=cache_dir)

        instances: List[Instance] = []
        for example in dataset["train"]:
            # Skipping every canceled question!
            if example["nullified"]:
                continue

            choices = example["choices"]
            # Map answer-key letters (e.g. "A") to the corresponding choice text.
            answers_dict = dict(zip(choices["label"], choices["text"]))
            correct_answer = answers_dict[example["answerKey"]]

            references = [
                Reference(Output(text=choice_text), tags=[CORRECT_TAG] if choice_text == correct_answer else [])
                for choice_text in choices["text"]
            ]
            instances.append(
                Instance(input=Input(text=example["question"]), split=TEST_SPLIT, references=references)
            )
        return instances
@@ -0,0 +1,53 @@
1
+ import datasets
2
+ import os
3
+ from typing import List
4
+ from helm.benchmark.scenarios.scenario import (
5
+ Scenario,
6
+ Instance,
7
+ Reference,
8
+ TEST_SPLIT,
9
+ Input,
10
+ Output,
11
+ CORRECT_TAG,
12
+ )
13
+ from helm.common.general import ensure_directory_exists
14
+
15
+
16
+ class OmniMATHScenario(Scenario):
17
+ """Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models
18
+
19
+ Omni-MATH is a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical
20
+ reasoning at the Olympiad level. The dataset focuses exclusively on Olympiad mathematics and comprises a \
21
+ vast collection of 4428 competition-level problems. These problems are meticulously categorized into 33 \
22
+ (and potentially more) sub-domains and span across 10 distinct difficulty levels, enabling a nuanced \
23
+ analysis of model performance across various mathematical disciplines and levels of complexity.."""
24
+
25
+ name = "omni_math"
26
+ description = "A Universal Olympiad Level Mathematic Benchmark for Large Language Models"
27
+ tags = ["math"]
28
+
29
+ def get_instances(self, output_path: str) -> List[Instance]:
30
+ # Get Omni-MATH from HuggingFace
31
+ cache_dir = os.path.join(output_path, "data")
32
+ ensure_directory_exists(cache_dir)
33
+ dataset = datasets.load_dataset(
34
+ "KbsdJames/Omni-MATH",
35
+ revision="40ba231d8f16e29ecd40e6407e2c8640145a8f62",
36
+ cache_dir=cache_dir,
37
+ split="test",
38
+ )
39
+ assert isinstance(dataset, datasets.Dataset)
40
+
41
+ # Read all instances
42
+ instances: List[Instance] = []
43
+ for idx, row in enumerate(dataset):
44
+
45
+ input = Input(text=row["problem"])
46
+ instance = Instance(
47
+ input=input,
48
+ references=[Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])],
49
+ split=TEST_SPLIT,
50
+ )
51
+ instances.append(instance)
52
+
53
+ return instances
@@ -2,7 +2,16 @@ from typing import List, Dict, Any, DefaultDict
2
2
  from datasets import load_dataset, Dataset
3
3
  from collections import defaultdict
4
4
 
5
- from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TRAIN_SPLIT, VALID_SPLIT, Output
5
+ from helm.benchmark.scenarios.scenario import (
6
+ CORRECT_TAG,
7
+ Reference,
8
+ Scenario,
9
+ Instance,
10
+ Input,
11
+ TRAIN_SPLIT,
12
+ VALID_SPLIT,
13
+ Output,
14
+ )
6
15
 
7
16
 
8
17
  class OpenAssistantScenario(Scenario):
@@ -110,7 +119,7 @@ class OpenAssistantScenario(Scenario):
110
119
  return instances
111
120
 
112
121
  # Download the raw data from Huggingface
113
- dataset: Any = load_dataset("OpenAssistant/oasst1")
122
+ dataset: Any = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")
114
123
 
115
124
  # Get the instances for each split
116
125
  train_instances = get_split_instances(dataset["train"], TRAIN_SPLIT)
@@ -0,0 +1,79 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import List, Optional
5
+
6
+ import datasets
7
+ import tiktoken
8
+
9
+ from helm.benchmark.scenarios.scenario import (
10
+ CORRECT_TAG,
11
+ Output,
12
+ Reference,
13
+ Scenario,
14
+ Instance,
15
+ TEST_SPLIT,
16
+ Input,
17
+ )
18
+ from helm.common.general import ensure_directory_exists
19
+
20
+
21
class OpenAIMRCRScenario(Scenario):
    """OpenAI MRCR scenario

    OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking
    an LLM's ability to distinguish between multiple needles hidden in context. This eval is
    inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).

    The task is as follows: The model is given a long, multi-turn, synthetically generated
    conversation between user and model where the user asks for a piece of writing about a topic,
    e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
    are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
    of one of those asks. For example, "Return the 2nd poem about tapirs".

    Reference: https://huggingface.co/datasets/openai/mrcr"""

    name = "openai_mrcr"
    description = "OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)." # noqa: E501
    tags = ["long_context", "mrcr"]

    # Each subset of the dataset hides this many identical "needle" asks in the conversation;
    # the value selects which parquet file to load in get_instances().
    NEEDLES_OPTIONS = [2, 4, 8]

    def __init__(self, needles: int, max_num_words: Optional[int] = None):
        """
        Args:
            needles: Number of hidden needles; must be one of NEEDLES_OPTIONS (2, 4, or 8).
            max_num_words: If set, conversations whose whitespace-delimited word count
                exceeds this limit are skipped.

        Raises:
            Exception: If `needles` is not one of NEEDLES_OPTIONS.
        """
        super().__init__()
        # Validate before storing so an invalid value never leaves a half-initialized scenario.
        if needles not in self.NEEDLES_OPTIONS:
            raise Exception(f"Needles must be one of {self.NEEDLES_OPTIONS}")
        self.needles = needles
        self.max_num_words = max_num_words
        # NOTE(review): this tokenizer is not referenced anywhere in this class —
        # presumably kept for token-based accounting elsewhere; confirm before removing.
        self.tokenizer = tiktoken.get_encoding("o200k_base")

    def count_words(self, messages: List[dict]) -> int:
        """Return the total whitespace-delimited word count across all message contents."""
        return sum(len(re.split(r"\s+", m["content"].strip())) for m in messages)

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the pinned MRCR subset for `self.needles` and build test Instances.

        Each row's prompt is a JSON-encoded list of chat messages; rows longer than
        `self.max_num_words` (when set) are skipped.
        """
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        # Each needle count lives in its own parquet file; pin the revision for reproducibility.
        dataset = datasets.load_dataset(
            "openai/mrcr",
            cache_dir=cache_dir,
            split="train",
            data_files=[f"{self.needles}needle.parquet"],
            revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
        )
        instances: List[Instance] = []
        for idx, row in enumerate(dataset):
            messages = json.loads(row["prompt"])
            # Optionally filter out conversations that exceed the word budget.
            if self.max_num_words and self.count_words(messages) > self.max_num_words:
                continue
            instance = Instance(
                id=f"{self.needles}needle{idx}",
                input=Input(messages=messages),
                references=[Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
                # Carried through per row; presumably consumed by the downstream
                # metric/grader — verify against the evaluator before removing.
                extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
            )
            instances.append(instance)

        return instances
@@ -4,7 +4,7 @@ from typing import List, Dict
4
4
 
5
5
  from helm.common.general import ensure_file_downloaded
6
6
 
7
- from .scenario import (
7
+ from helm.benchmark.scenarios.scenario import (
8
8
  Scenario,
9
9
  Instance,
10
10
  Reference,