crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.
Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
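
The listing above is a file-level summary of the two wheels. For readers who want to reproduce a similar summary locally, here is a minimal sketch in Python. This is not the registry's own tooling, and it assumes both wheel files have already been downloaded into the working directory under their canonical names:

import zipfile

def wheel_members(path: str) -> dict:
    # Map each archive member to its uncompressed size.
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.file_size for info in zf.infolist()}

old = wheel_members("crfm_helm-0.5.4-py3-none-any.whl")
new = wheel_members("crfm_helm-0.5.6-py3-none-any.whl")

added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(name for name in set(old) & set(new) if old[name] != new[name])

print(f"Files changed ({len(added) + len(removed) + len(changed)})")
for name in added:
    print(f"  + {name}")
for name in removed:
    print(f"  - {name}")
for name in changed:
    print(f"  ~ {name} ({old[name]} -> {new[name]} bytes)")

A real registry diff additionally compares file contents (as in the hunks below), but the member-level comparison is enough to recover the added/removed/renamed listing.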

helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -0,0 +1,127 @@
+ from typing import Dict, List
+ from datasets import load_dataset
+
+ from helm.common.hierarchical_logger import hlog
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     PassageQuestionInput,
+     Output,
+ )
+
+
+ class MedCalcBenchScenario(Scenario):
+     """
+     MedCalc-Bench is the first medical calculation dataset used to benchmark
+     LLMs' ability to serve as clinical calculators.
+     Each instance in the dataset consists of a patient note, a question asking
+     to compute a specific clinical value, a final answer value, and a step-by-step
+     solution explaining how the final answer was obtained. The dataset covers 55
+     different calculation tasks and is intended to spur improvements in the verbal
+     and computational reasoning skills of LLMs in medical settings.
+
+     The dataset contains a training set of 10,053 instances and a test set of
+     1,047 instances.
+
+     Dataset: https://huggingface.co/datasets/ncbi/MedCalc-Bench-v1.0
+     Paper: https://arxiv.org/abs/2406.12036
+
+     Sample Prompt:
+     Given a patient note and a clinical question, compute the requested medical value.
+     Be as concise as possible.
+
+     Patient note: A 70-year-old female was rushed into the ICU due to respiratory distress,
+     following which she was promptly put on mechanical ventilation. Her delivered oxygen fell
+     to 51 % FiO₂; meanwhile, her partial pressure of oxygen (PaO₂) registered at 74 mm Hg.
+     She was conscious but visibly disoriented with a functional Glasgow Coma Score of 12.
+     She was hypotensive with blood pressure of 91/70 mm Hg. Multiple vasopressors are being administered
+     simultaneously including DOPamine at 4 mcg/kg/min, norEPINEPHrine at 0.06 mcg/kg/min,
+     DOBUTamine at 3 mcg/kg/min, and EPINEPHrine at 0.03 mcg/kg/min. Laboratory evaluations
+     revealed mild renal impairment with creatinine levels slightly elevated at 1.6 mg/dL
+     and a bilirubin level of 1.9 mg/dL. Her platelet count was found to be 165,000/µL.
+     Her daily urine output of 950 mL.
+     Question: What is the patient's Sequential Organ Failure Assessment (SOFA) Score?
+
+     Answer:
+
+     @misc{khandekar2024medcalcbench,
+         title={MedCalc-Bench: Evaluating Large Language Models for Medical Calculations},
+         author={
+             Nikhil Khandekar and Qiao Jin and Guangzhi Xiong and Soren Dunn and Serina S Applebaum and
+             Zain Anwar and Maame Sarfo-Gyamfi and Conrad W Safranek and Abid A Anwar and Andrew Zhang and
+             Aidan Gilson and Maxwell B Singer and Amisha Dave and Andrew Taylor and Aidong Zhang and
+             Qingyu Chen and Zhiyong Lu
+         },
+         year={2024},
+         eprint={2406.12036},
+         archivePrefix={arXiv},
+         primaryClass={cs.CL}
+     }
+     """
+
+     name = "medcalc_bench"
+     description = (
+         "MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute "
+         "clinically relevant values from patient notes. Each instance consists of a clinical note "
+         "describing the patient's condition, a diagnostic question targeting a specific medical "
+         "value, and a ground truth response."
+     )
+     tags = ["knowledge", "reasoning", "biomedical"]
+
+     def __init__(self):
+         super().__init__()
+
+     def process_csv(self, data, split: str) -> List[Instance]:
+         instances: List[Instance] = []
+         hlog(f"Processing data for {split} split")
+         for row in data:
+             question = row["Question"]
+             ground_truth_answer = row["Ground Truth Answer"]
+             patient_note = row["Patient Note"]
+             id = row["Row Number"]
+
+             prompt = PassageQuestionInput(
+                 passage=patient_note + "\n", question=question + "\n", passage_prefix="Patient note: "
+             )
+
+             extra_data = {
+                 "category": row["Category"],
+                 "upper_limit": row["Upper Limit"],
+                 "lower_limit": row["Lower Limit"],
+             }
+
+             instance = Instance(
+                 input=prompt,
+                 references=[Reference(Output(text=ground_truth_answer), tags=[CORRECT_TAG])],
+                 extra_data=extra_data,
+                 split=split,
+                 id=id,
+             )
+             instances.append(instance)
+         return instances
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Load the MedCalc-Bench dataset from Hugging Face
+         dataset = load_dataset("ncbi/MedCalc-Bench-v1.0")
+
+         # Process all the instances; limit to the zero-shot (test) setting
+         instances: List[Instance] = []
+         splits: Dict[str, str] = {
+             # "train": TRAIN_SPLIT,
+             "test": TEST_SPLIT,
+         }
+         for hf_split, split in splits.items():
+             data = dataset[hf_split]
+             instances.extend(self.process_csv(data, split))
+
+         return instances
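
For orientation, a hypothetical usage sketch of this new scenario (the output path is an invented example; get_instances() downloads the dataset from Hugging Face on first use, and the module path comes from file #336 in the listing above):

from helm.benchmark.scenarios.medcalc_bench_scenario import MedCalcBenchScenario

scenario = MedCalcBenchScenario()
instances = scenario.get_instances(output_path="/tmp/medcalc_bench")  # illustrative path
example = instances[0]
print(example.input.text[:200])           # "Patient note: ..." followed by the question
print(example.references[0].output.text)  # ground-truth value
print(example.extra_data)                 # category plus upper/lower tolerance limits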

helm/benchmark/scenarios/medec_scenario.py
@@ -0,0 +1,125 @@
+ import csv
+ import os
+ from typing import List
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     CORRECT_TAG,
+     TEST_SPLIT,
+     Input,
+     Output,
+ )
+ from helm.common.general import ensure_file_downloaded
+
+
+ class MedecScenario(Scenario):
+     """
+     Processes the MEDEC dataset for medical error detection and correction tasks.
+
+     MEDEC is the first publicly available benchmark for medical error detection and correction
+     in clinical notes, introduced by Ben Abacha et al. (2024). The dataset includes 3,848 clinical
+     texts from the MS and UW collections, covering five types of errors:
+     - Diagnosis
+     - Management
+     - Treatment
+     - Pharmacotherapy
+     - Causal Organism
+
+     The dataset consists of:
+     - Training Set: 2,189 MS texts
+     - Validation Set: 574 MS texts and 160 UW texts
+     - Test Set: 597 MS texts and 328 UW texts
+
+     Each clinical text is labeled as either correct or containing one error. The task involves:
+     (A) Predicting the error flag (1: the text contains an error, 0: the text has no errors).
+     (B) For flagged texts, extracting the sentence that contains the error.
+     (C) Generating a corrected sentence.
+
+     The MEDEC dataset was used for the MEDIQA-CORR shared task to evaluate seventeen participating
+     systems. Recent LLMs (e.g., GPT-4, Claude 3.5 Sonnet, Gemini 2.0 Flash) have been evaluated on
+     this dataset, showing good performance but still lagging behind medical doctors in error
+     detection and correction tasks.
+
+     Task:
+     Given a clinical text, models must identify errors and correct them while demonstrating medical
+     knowledge and reasoning capabilities.
+     """
+
+     GIT_HASH = "49c59dcba43a7af8717590a405b11a57f42b04df"
+     TRAIN_URL = f"https://raw.githubusercontent.com/abachaa/MEDEC/{GIT_HASH}/MEDEC-MS/MEDEC-Full-TrainingSet-with-ErrorType.csv"  # noqa: E501
+     TEST_URL = f"https://raw.githubusercontent.com/abachaa/MEDEC/{GIT_HASH}/MEDEC-MS/MEDEC-MS-TestSet-with-GroundTruth-and-ErrorType.csv"  # noqa: E501
+
+     name = "medec"
+     description = (
+         "MEDEC is a benchmark composed of clinical narratives that include either correct "
+         "documentation or medical errors. Each entry includes sentence-level identifiers and an "
+         "associated correction task. The model must review the narrative and either identify "
+         "the erroneous sentence and correct it, or confirm that the text is entirely accurate."
+     )
+     tags = ["error_detection", "error_correction", "biomedical"]
+
+     def download_csv(self, url: str, output_path: str, file_name: str) -> str:
+         """Download the CSV file and return its path."""
+         csv_path = os.path.join(output_path, file_name)
+         ensure_file_downloaded(source_url=url, target_path=csv_path, unpack=False)
+         return csv_path
+
+     def process_csv(self, csv_path: str, split: str) -> List[Instance]:
+         """Read and process a CSV file to generate instances."""
+         instances: List[Instance] = []
+         with open(csv_path, "r", encoding="utf-8") as f:
+             reader = csv.DictReader(f)
+
+             for row in reader:
+                 # Ensure required fields exist
+                 if not row.get("Sentences"):
+                     continue
+
+                 # Extract text content
+                 input_text = row["Sentences"].strip()
+
+                 # Extract metadata
+                 error_flag = int(row.get("Error Flag", 0))
+                 error_sentence_id = row.get("Error Sentence ID", "-1").strip()
+                 corrected_sentence = row.get("Corrected Sentence", "").strip()
+
+                 references = []
+                 if error_flag == 1 and corrected_sentence != "NA" and error_sentence_id != "-1":
+                     # If there is an error, store the corrected sentence
+                     references.append(
+                         Reference(
+                             Output(text=f"{error_sentence_id} {corrected_sentence}"),
+                             tags=[CORRECT_TAG],
+                         )
+                     )
+                 else:
+                     # If no error exists, expect "CORRECT" as the output
+                     references.append(
+                         Reference(
+                             Output(text="CORRECT"),
+                             tags=[CORRECT_TAG],
+                         )
+                     )
+
+                 instance = Instance(
+                     input=Input(text=input_text),
+                     references=references,
+                     split=split,
+                 )
+                 instances.append(instance)
+
+         return instances
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         """Download and process the dataset to generate instances."""
+         instances: List[Instance] = []
+
+         # Download and process the training set - limited to the zero-shot setting for now
+         # train_csv = self.download_csv(self.TRAIN_URL, output_path, "medec_train.csv")
+         # instances.extend(self.process_csv(train_csv, TRAIN_SPLIT))
+
+         # Download and process the test set
+         test_csv = self.download_csv(self.TEST_URL, output_path, "medec_test.csv")
+         instances.extend(self.process_csv(test_csv, TEST_SPLIT))
+
+         return instances
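
Since every reference above is either "CORRECT" or "<error sentence id> <corrected sentence>", a downstream consumer can split the test set by expected behavior. A hypothetical smoke test (the output path is an invented example; the test CSV is fetched from GitHub on first use):

from helm.benchmark.scenarios.medec_scenario import MedecScenario

instances = MedecScenario().get_instances(output_path="/tmp/medec")
flagged = [inst for inst in instances if inst.references[0].output.text != "CORRECT"]
print(f"{len(flagged)} of {len(instances)} notes expect an '<id> <corrected sentence>' answer")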

helm/benchmark/scenarios/medhallu_scenario.py
@@ -0,0 +1,72 @@
+ from typing import List
+ from datasets import load_dataset
+
+ from helm.benchmark.scenarios.scenario import (
+     Scenario,
+     Instance,
+     Reference,
+     TEST_SPLIT,
+     CORRECT_TAG,
+     Output,
+     Input,
+ )
+
+
+ class MedHalluScenario(Scenario):
+     """
+     MedHallu is a medical hallucination dataset consisting of PubMed articles and associated
+     questions; the objective is to classify whether a given answer is factual or hallucinated.
+     MedHallu: https://medhallu.github.io/
+     """
+
+     name = "medhallu"
+     description = (
+         "MedHallu is a benchmark focused on evaluating factual correctness in biomedical "
+         "question answering. Each instance contains a PubMed-derived knowledge snippet, a "
+         "biomedical question, and a model-generated answer. The task is to classify whether the "
+         "answer is factually correct or contains hallucinated (non-grounded) information. This "
+         "benchmark is designed to assess the factual reliability of medical language models."
+     )
+     tags = ["knowledge", "reasoning", "biomedical"]
+
+     def create_instance(self, question, knowledge, answer, label, split):
+         prompt_text = f"""World Knowledge: {knowledge}
+
+ Question: {question}
+
+ Answer: {answer}
+ """
+         return Instance(
+             input=Input(
+                 text=prompt_text,
+             ),
+             references=[Reference(Output(text=label), tags=[CORRECT_TAG])],
+             split=split,
+         )
+
+     def get_instances(self, output_path: str) -> List[Instance]:
+         # Load the MedHallu dataset from Hugging Face
+         dataset = load_dataset(
+             "UTAustin-AIHealth/MedHallu",
+             "pqa_labeled",
+             split="train",
+             revision="515060458a945c633debc6fd5baac7764416b724",
+         )
+
+         # Process all the instances - limit to zero-shot setting
+         instances: List[Instance] = []
+         for row in dataset:
+             question = row["Question"]
+             ground_truth_answer = row["Ground Truth"]
+             knowledge = row["Knowledge"]
+             hallucinated_answer = row["Hallucinated Answer"]
+
+             gt_instance = self.create_instance(question, knowledge, ground_truth_answer, label="0", split=TEST_SPLIT)
+             instances.append(gt_instance)
+
+             hallucinated_instance = self.create_instance(
+                 question, knowledge, hallucinated_answer, label="1", split=TEST_SPLIT
+             )
+             instances.append(hallucinated_instance)
+         return instances
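
Note that get_instances() emits two instances per dataset row: one with the ground-truth answer labeled "0" (factual) and one with the hallucinated answer labeled "1", so the resulting test set is balanced by construction. A hypothetical sanity check of that pairing (the output path is an invented example and is unused by this scenario, but the interface requires it):

from helm.benchmark.scenarios.medhallu_scenario import MedHalluScenario

instances = MedHalluScenario().get_instances(output_path="/tmp/medhallu")
labels = [inst.references[0].output.text for inst in instances]
assert labels.count("0") == labels.count("1")  # one factual and one hallucinated per row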
@@ -0,0 +1,105 @@
+from typing import Dict, List
+from datasets import load_dataset
+
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MediQAScenario(Scenario):
+    """
+    MEDIQA-QA is a dataset designed to benchmark large language models (LLMs) on medical
+    question answering (QA) tasks.
+    Each instance in the dataset includes a medical question, a set of candidate answers,
+    relevance annotations for ranking, and additional context to evaluate understanding
+    and retrieval capabilities in a healthcare setting.
+
+    The dataset encompasses diverse question types, including consumer health queries
+    and clinical questions, making it suitable for assessing LLMs' ability to answer
+    consumer healthcare questions.
+
+    This dataset comprises two training sets of 104 instances each, a validation set
+    of 25 instances, and a test set of 150 instances.
+
+    Dataset: https://huggingface.co/datasets/bigbio/mediqa_qa
+    Paper: https://aclanthology.org/W19-5039/
+
+    Sample Prompt:
+        Answer the following consumer health question.
+
+        Question: Noonan syndrome. What are the references with noonan syndrome
+        and polycystic renal disease?
+        Answer:
+
+    @inproceedings{MEDIQA2019,
+        author = {Asma {Ben Abacha} and Chaitanya Shivade and Dina Demner{-}Fushman},
+        title = {Overview of the MEDIQA 2019 Shared Task on Textual Inference,
+                 Question Entailment and Question Answering},
+        booktitle = {ACL-BioNLP 2019},
+        year = {2019}
+    }
+    """
+
+    name = "medi_qa"
+    description = (
+        "MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate "
+        "medically accurate answers to patient-generated questions. Each instance includes a "
+        "consumer health question, a set of candidate answers (used in ranking tasks), relevance "
+        "annotations, and optionally, additional context. The benchmark focuses on supporting "
+        "patient understanding and accessibility in health communication."
+    )
+    tags = ["knowledge", "biomedical"]
+
+    def _get_highest_ranked_answer(self, answers: List[Dict[str, Dict[str, str]]]) -> str:
+        best_answer: str = ""
+        for answer in answers:
+            if answer["Answer"]["ReferenceRank"] != 1:
+                continue
+            best_answer = answer["Answer"]["AnswerText"]
+            break
+        return best_answer
+
+    def process_split(self, data, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        hlog(f"Processing data for {split} split")
+        for row in data:
+            row = row["QUESTION"]
+            question = row["QuestionText"]
+            ground_truth_answer = self._get_highest_ranked_answer(row["AnswerList"])
+            instances.append(
+                Instance(
+                    input=Input(question),
+                    references=[Reference(Output(ground_truth_answer), tags=[CORRECT_TAG])],
+                    split=split,
+                    id=row["QID"],
+                )
+            )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Load the MEDIQA dataset from Hugging Face
+        dataset = load_dataset(
+            "bigbio/mediqa_qa",
+            trust_remote_code=True,
+            revision="9288641f4c785c95dc9079fa526dabb12efdb041",
+        )
+
+        # Process all the instances; limit to the zero-shot setting
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            # "train_live_qa_med": TRAIN_SPLIT,
+            # "validation": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for hf_split, split in splits.items():
+            data = dataset[hf_split]
+            instances.extend(self.process_split(data, split))
+
+        return instances
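The helper _get_highest_ranked_answer keeps only the candidate annotated with ReferenceRank 1 and falls back to an empty string when no candidate carries that rank. A self-contained sketch of that selection logic on invented data (the nested AnswerList shape mirrors the bigbio/mediqa_qa rows; the values are made up):

    # Invented sample mirroring the bigbio/mediqa_qa row structure.
    row = {
        "QID": "Q1",
        "AnswerList": [
            {"Answer": {"ReferenceRank": 2, "AnswerText": "A reasonable answer."}},
            {"Answer": {"ReferenceRank": 1, "AnswerText": "The top-ranked answer."}},
        ],
    }
    best = next(
        (a["Answer"]["AnswerText"] for a in row["AnswerList"] if a["Answer"]["ReferenceRank"] == 1),
        "",  # same empty-string fallback as the scenario
    )
    assert best == "The top-ranked answer."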
@@ -5,7 +5,7 @@ import pandas as pd
 
 from helm.common.general import ensure_file_downloaded
 
-from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
 
 
 class MedicationQAScenario(Scenario):
@@ -31,7 +31,13 @@ class MedicationQAScenario(Scenario):
     FILENAME = "MedInfo2019-QA-Medications.xlsx"
 
     name = "medication_qa"
-    description = "MedInfo 2019 MedicationQA medication question answering task"
+    description = (
+        "MedicationQA is a benchmark composed of open-ended consumer health questions "
+        "specifically focused on medications. Each example consists of a free-form question "
+        "and a corresponding medically grounded answer. The benchmark evaluates a model's "
+        "ability to provide accurate, accessible, and informative medication-related responses "
+        "for a lay audience."
+    )
     tags = ["knowledge", "generation", "question_answering", "biomedical"]
 
     def download_medication_qa(self, path: str):
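These multi-line description strings rely on Python's implicit concatenation of adjacent string literals, which joins fragments with no separator; each fragment therefore needs an explicit trailing (or leading) space. A two-line illustration:

    # Adjacent string literals are joined verbatim, so the trailing spaces are load-bearing.
    description = "open-ended consumer health questions " "specifically focused on medications."
    assert "questions specifically" in description  # without the space: "questionsspecifically"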
@@ -0,0 +1,171 @@
+from typing import List, Optional
+
+from datasets import load_dataset, Dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    make_rank_tag,
+)
+
+
+class MELTInformationRetrievalScenario(Scenario):
+    name = "melt_information_retrieval"
+    description = "Scenario for information retrieval tasks."
+    tags = ["information_retrieval"]
+
+    """ Number of training queries to sample. """
+    NUM_TRAIN_QUERIES = 1000
+
+    """ Upper and lower bounds on top-k.
+
+    The top-k number represents the number of passages we will consider per
+    query. Max top-k for the train and validation files are set to the number
+    of passages included in the corresponding top-k files.
+    """
+    MIN_TOPK: int = 11
+    MAX_TRAIN_TOPK: int = 20
+    MAX_VALID_TOPK: int = 1000
+
+    def __init__(
+        self, dataset_name: str, revision: str, subset: Optional[str] = None, valid_topk: Optional[int] = None
+    ):
+        """The constructor for the MELTInformationRetrievalScenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to None.
+            valid_topk: If set, specifies the number of top documents for which the
+                validation instances will be created. Must be in the range
+                [self.MIN_TOPK, self.MAX_VALID_TOPK].
+        """
+        super().__init__()
+
+        # Input validation
+        self.dataset_name = dataset_name
+        self.revision = revision
+        self.subset = subset
+        self.valid_topk: Optional[int] = valid_topk
+        if self.valid_topk is not None:
+            assert self.MIN_TOPK <= self.valid_topk <= self.MAX_VALID_TOPK
+
+    def get_train_instances(self) -> List[Instance]:
+        """Get training instances.
+
+        References for each instance are selected as follows:
+            1. We select 1 correct reference, where the document included
+               corresponds to the best document for the given train query.
+            2. We create 1 wrong reference, where the document included
+               corresponds to a non-gold document for the given train query.
+        """
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        instances = []
+        for i, sample in enumerate(dataset["train"]):
+            if i >= self.NUM_TRAIN_QUERIES:
+                break
+
+            references = [
+                Reference(Output(text=sample["positive"]), tags=[CORRECT_TAG]),
+                Reference(Output(text=sample["negative"]), tags=[]),
+            ]
+
+            instances.append(Instance(Input(text=sample["query"]), references=references, split=TRAIN_SPLIT))
+        return instances
+
+    def get_valid_instances(self) -> List[Instance]:
+        """Get validation instances.
+
+        By default, we create a reference for each document ID for which there
+        is a judgment with respect to the provided query ID.
+
+        If self.valid_topk is not None, we ensure that a reference is created
+        only for the documents that appear in the top self.valid_topk documents
+        for the given validation query.
+        """
+        dataset = load_dataset(
+            self.dataset_name,
+            f"runs-{self.subset}",
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        instances = []
+        for sample in dataset["bm25"]:
+            references = []
+
+            for k, passage_dict in enumerate(Dataset.from_dict(sample["passages"])):
+                # When valid_topk is None, keep a reference for every judged passage.
+                if self.valid_topk is not None and k >= self.valid_topk:
+                    break
+                tags = [f"docid={passage_dict['id']}"]
+                if k == 0:
+                    tags.append(CORRECT_TAG)
+                tags.append(make_rank_tag(rank=k + 1))  # Top-k rank
+                references.append(Reference(Output(text=passage_dict["passage"]), tags=tags))
+
+            instances.append(Instance(Input(text=sample["query"]), references=references, split=VALID_SPLIT))
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Get instances for this scenario.
+
+        Refer to the documentation of the following methods for details on how
+        the instances are created:
+            * self.get_train_instances
+            * self.get_valid_instances
+        """
+        hlog("Preparing training instances.")
+        train_instances = self.get_train_instances()
+
+        hlog("Preparing validation instances.")
+        valid_instances = self.get_valid_instances()
+
+        return train_instances + valid_instances
+
+
+class MELTInformationRetrievalMMARCOScenario(MELTInformationRetrievalScenario):
+    """
+    Scenario for the MMARCO dataset.
+    """
+
+    name = "melt_information_retrieval_mmarco"
+    description = "MMARCO dataset for information retrieval in Vietnamese."
+    tags = ["information_retrieval"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="unicamp-dl/mmarco",
+            revision="6d039c4638c0ba3e46a9cb7b498b145e7edc6230",
+            subset="vietnamese",
+            **kwargs,
+        )
+
+
+class MELTInformationRetrievalMRobustScenario(MELTInformationRetrievalScenario):
+    """
+    Scenario for the MRobust dataset.
+    """
+
+    name = "melt_information_retrieval_mrobust"
+    description = "MRobust dataset for information retrieval in Vietnamese."
+    tags = ["information_retrieval"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="unicamp-dl/mrobust",
+            revision="fda452a7fbfd9550db2f78d9d98e6b3ec16734df",
+            subset="vietnamese",
+            **kwargs,
+        )
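The two subclasses pin only the dataset name, revision, and Vietnamese subset, leaving valid_topk as the caller-facing knob. A hedged usage sketch (the module path is an assumption, and loading the pinned revisions requires network access plus trust_remote_code):

    # Usage sketch; the module path is assumed, not confirmed by this diff.
    from helm.benchmark.scenarios.melt_information_retrieval_scenario import (
        MELTInformationRetrievalMMARCOScenario,
    )
    from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, VALID_SPLIT

    scenario = MELTInformationRetrievalMMARCOScenario(valid_topk=20)  # within [MIN_TOPK, MAX_VALID_TOPK]
    instances = scenario.get_instances(output_path="/tmp/melt_ir")
    train = [i for i in instances if i.split == TRAIN_SPLIT]  # capped at NUM_TRAIN_QUERIES
    valid = [i for i in instances if i.split == VALID_SPLIT]
    # Each validation instance carries up to 20 ranked passage references,
    # tagged with docid and rank, with the top BM25 passage marked correct.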