crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm has been flagged as potentially problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/medbullets_scenario.py
@@ -67,8 +67,13 @@ class MedBulletsScenario(Scenario):
         "https://raw.githubusercontent.com/HanjieChen/ChallengeClinicalQA/refs/heads/main/medbullets/"
     )

-    name = "medbullet"
-    description = "A USMLE-style medical question dataset with multiple-choice answers and explanations."
+    name = "medbullets"
+    description = (
+        "Medbullets is a benchmark of USMLE-style medical questions designed to assess a"
+        "model’s ability to understand and apply clinical knowledge. Each question is accompanied"
+        "by a patient scenario and five multiple-choice options, similar to those found on"
+        "Step 2 and Step 3 on the US medical licensing exam."
+    )

     tags = ["reasoning", "biomedical"]

     # Define the possible answer choices
helm/benchmark/scenarios/medcalc_bench_scenario.py
@@ -71,8 +71,10 @@ class MedCalcBenchScenario(Scenario):

     name = "medcalc_bench"
     description = (
-        "A dataset which consists of a patient note, a question "
-        "requesting to compute a specific medical value, and a ground truth answer."
+        "MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute"
+        "clinically relevant values from patient notes. Each instance consists of a clinical note"
+        "describing the patient's condition, a diagnostic question targeting a specific medical"
+        "value, and a ground truth response."
     )
     tags = ["knowledge", "reasoning", "biomedical"]

helm/benchmark/scenarios/medec_scenario.py
@@ -50,7 +50,12 @@ class MedecScenario(Scenario):
     TEST_URL = f"https://raw.githubusercontent.com/abachaa/MEDEC/{GIT_HASH}/MEDEC-MS/MEDEC-MS-TestSet-with-GroundTruth-and-ErrorType.csv"  # noqa: E501

     name = "medec"
-    description = "A dataset containing medical narratives with error detection and correction pairs."
+    description = (
+        "Medec is a benchmark composed of clinical narratives that include either correct"
+        "documentation or medical errors. Each entry includes sentence-level identifiers and an"
+        "associated correction task. The model must review the narrative and either identify"
+        "the erroneous sentence and correct it, or confirm that the text is entirely accurate."
+    )
     tags = ["error_detection", "error_correction", "biomedical"]

     def download_csv(self, url: str, output_path: str, file_name: str) -> str:
helm/benchmark/scenarios/medhallu_scenario.py
@@ -20,7 +20,13 @@ class MedHalluScenario(Scenario):
     """

     name = "medhallu"
-    description = "A dataset of PubMed articles and associated questions, with the objective being to classify whether the answer is factual or hallucinated."  # noqa: E501
+    description = (
+        "MedHallu is a benchmark focused on evaluating factual correctness in biomedical"
+        "question answering. Each instance contains a PubMed-derived knowledge snippet, a"
+        "biomedical question, and a model-generated answer. The task is to classify whether the"
+        "answer is factually correct or contains hallucinated (non-grounded) information. This"
+        "benchmark is designed to assess the factual reliability of medical language models."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]

     def create_instance(self, question, knowledge, answer, label, split):
helm/benchmark/scenarios/medi_qa_scenario.py
@@ -49,9 +49,11 @@ class MediQAScenario(Scenario):

     name = "medi_qa"
     description = (
-        "A dataset including a medical question, a set of candidate answers,"
-        "relevance annotations for ranking, and additional context to evaluate understanding"
-        "and retrieval capabilities in a healthcare setting."
+        "MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate"
+        "medically accurate answers to patient-generated questions. Each instance includes a"
+        "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
+        "annotations, and optionally, additional context. The benchmark focuses on supporting"
+        "patient understanding and accessibility in health communication."
     )
     tags = ["knowledge", "biomedical"]

@@ -88,7 +90,11 @@ class MediQAScenario(Scenario):

     def get_instances(self, output_path: str) -> List[Instance]:
         # Load the MEDIQA dataset from Hugging Face
-        dataset = load_dataset("bigbio/mediqa_qa")
+        dataset = load_dataset(
+            "bigbio/mediqa_qa",
+            trust_remote_code=True,
+            revision="9288641f4c785c95dc9079fa526dabb12efdb041",
+        )

         # Process all the instances
         instances: List[Instance] = []
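
The change above pins the MEDIQA dataset load to a fixed Hugging Face revision and opts in to the dataset's loading script. A minimal standalone sketch of that pattern, assuming the `datasets` library is installed and network access is available (not part of the diff):

# Illustrative sketch only; mirrors the call added in the diff above.
from datasets import load_dataset

dataset = load_dataset(
    "bigbio/mediqa_qa",  # dataset repository on the Hugging Face Hub
    trust_remote_code=True,  # the repository ships a custom loading script
    revision="9288641f4c785c95dc9079fa526dabb12efdb041",  # pinned commit for reproducibility
)
print(dataset)  # inspect the available splits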
helm/benchmark/scenarios/medication_qa_scenario.py
@@ -31,7 +31,13 @@ class MedicationQAScenario(Scenario):
     FILENAME = "MedInfo2019-QA-Medications.xlsx"

     name = "medication_qa"
-    description = "Open text question-answer pairs regarding consumer health questions about medication."
+    description = (
+        "MedicationQA is a benchmark composed of open-ended consumer health questions"
+        "specifically focused on medications. Each example consists of a free-form question"
+        "and a corresponding medically grounded answer. The benchmark evaluates a model's"
+        "ability to provide accurate, accessible, and informative medication-related responses"
+        "for a lay audience."
+    )
     tags = ["knowledge", "generation", "question_answering", "biomedical"]

     def download_medication_qa(self, path: str):
helm/benchmark/scenarios/melt_ir_scenario.py (new file)
@@ -0,0 +1,171 @@
+from typing import List, Optional
+
+from datasets import load_dataset, Dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    make_rank_tag,
+)
+
+
+class MELTInformationRetrievalScenario(Scenario):
+    name = "melt_information_retrieval"
+    description = "Scenario for information retrieval tasks."
+    tags = ["information_retrieval"]
+
+    """ Dictionary mapping task track tuples to the number of queries. """
+    NUM_TRAIN_QUERIES = 1000
+
+    """ Upper and lower bounds on top-k.
+
+    The top-k number represents the number of passages we will consider per
+    query. Max top-k for the train and validation files are set to the number
+    of passages included in the corresponding top-k files.
+    """
+    MIN_TOPK: int = 11
+    MAX_TRAIN_TOPK: int = 20
+    MAX_VALID_TOPK: int = 1000
+
+    def __init__(
+        self, dataset_name: str, revision: str, subset: Optional[str] = None, valid_topk: Optional[int] = None
+    ):
+        """The constructor for the MSMARCOScenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            valid_topk: If set, specifies the number of top documents for which the
+                validation instances will be created. Must be in the range
+                [self.MIN_TOPK, self.MAX_VALID_TOPK].
+        """
+        super().__init__()
+
+        # Input validation
+        self.dataset_name = dataset_name
+        self.revision = revision
+        self.subset = subset
+        self.valid_topk: Optional[int] = valid_topk
+        if self.valid_topk is not None:
+            assert valid_topk and self.MIN_TOPK <= valid_topk <= self.MAX_VALID_TOPK
+
+    def get_train_instances(self) -> List[Instance]:
+        """Get training instances.
+        References for each instance are selected as follows:
+            1. We select 1 correct reference, where the documents included
+                corresponds to the best document for the given train query.
+            2. We create 1 wrong reference, where the document included
+                corresponds to a non-gold document for the given train query.
+        """
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        instances = []
+        for i, sample in enumerate(dataset["train"]):
+
+            if i >= self.NUM_TRAIN_QUERIES:
+                break
+
+            references = [
+                Reference(Output(text=sample["positive"]), tags=[CORRECT_TAG]),
+                Reference(Output(text=sample["negative"]), tags=[]),
+            ]
+
+            instances.append(Instance(Input(text=sample["query"]), references=references, split=TRAIN_SPLIT))
+        return instances
+
+    def get_valid_instances(self) -> List[Instance]:
+        """Get validation instances.
+        By default, we create a reference for each Document ID for which there
+        is a judgment with respect to the provided Query ID.
+
+        If self.valid_topk is not None, we ensure that a reference is created
+        for all the documents that appear in top self.valid_topk documents for
+        the given validation query.
+        """
+        dataset = load_dataset(
+            self.dataset_name,
+            f"runs-{self.subset}",
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        instances = []
+        for sample in dataset["bm25"]:
+            references = []
+
+            for k, passage_dict in enumerate(Dataset.from_dict(sample["passages"])):
+                if self.valid_topk is None or k >= self.valid_topk:
+                    break
+                tags = []
+                tags.append(f"docid={passage_dict['id']}")
+                if k == 0:
+                    tags.append(CORRECT_TAG)
+                tags.append(make_rank_tag(rank=k + 1))  # Top-k rank
+                references.append(Reference(Output(text=passage_dict["passage"]), tags=tags))
+
+            instances.append(Instance(Input(text=sample["query"]), references=references, split=VALID_SPLIT))
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Get instances for this scenario.
+
+        Refer to the documentation of the following methods for details on how
+        the instances are created:
+            * self.get_train_instances
+            * self.get_valid_instances
+        """
+
+        hlog("Preparing training instances.")
+        train_instances = self.get_train_instances()
+
+        hlog("Preparing validation instances.")
+        valid_instances = self.get_valid_instances()
+
+        return train_instances + valid_instances
+
+
+class MELTInformationRetrievalMMARCOScenario(MELTInformationRetrievalScenario):
+    """
+    Scenario for the MMARCO dataset.
+    """
+
+    name = "melt_information_retrieval_mmarco"
+    description = "MMARCO dataset for information retrieval in Vietnamese."
+    tags = ["information_retrieval"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="unicamp-dl/mmarco",
+            revision="6d039c4638c0ba3e46a9cb7b498b145e7edc6230",
+            subset="vietnamese",
+            **kwargs,
+        )
+
+
+class MELTInformationRetrievalMRobustScenario(MELTInformationRetrievalScenario):
+    """
+    Scenario for the MRobust dataset.
+    """
+
+    name = "melt_information_retrieval_mrobust"
+    description = "MRobust dataset for information retrieval in Vietnamese."
+    tags = ["information_retrieval"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="unicamp-dl/mrobust",
+            revision="fda452a7fbfd9550db2f78d9d98e6b3ec16734df",
+            subset="vietnamese",
+            **kwargs,
+        )
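
The new melt_ir_scenario.py module above adds Vietnamese retrieval scenarios built on MMARCO and MRobust. A hypothetical usage sketch follows; the output path is illustrative, and get_instances downloads the Hugging Face datasets, so it needs network access:

# Hypothetical usage of a scenario class added in this release (not part of the diff).
from helm.benchmark.scenarios.melt_ir_scenario import MELTInformationRetrievalMMARCOScenario

scenario = MELTInformationRetrievalMMARCOScenario(valid_topk=20)  # must lie within [MIN_TOPK, MAX_VALID_TOPK]
instances = scenario.get_instances(output_path="scenario_output/melt_ir")  # output_path value is illustrative
print(f"Built {len(instances)} train + validation instances")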
helm/benchmark/scenarios/melt_knowledge_scenario.py (new file)
@@ -0,0 +1,246 @@
+from abc import abstractmethod
+from typing import Dict, List, Tuple, Optional
+
+import random
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Input,
+    Output,
+)
+
+
+class MELTClosedBookQAScenario(Scenario):
+    name = "melt_closed_book_qa"
+    description = "Closed Book Question Answering scenario."
+    tags = ["question_answering"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """
+        Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        for dataset_split_name, helm_split_name in splits.items():
+
+            for sample in dataset[dataset_split_name]:
+                instance = Instance(
+                    input=Input(text=sample["question"]),
+                    references=[Reference(Output(text=sample["answer"]), tags=[CORRECT_TAG])],
+                    split=helm_split_name,
+                )
+                instances.append(instance)
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTKnowledgeZaloScenario(MELTClosedBookQAScenario):
+    """
+    Scenario for the Zalo dataset.
+    """
+
+    name = "melt_knowledge_zalo"
+    description = "Zalo dataset for closed-book question answering."
+    tags = ["question_answering", "knowledge"]
+
+    def __init__(self):
+        super().__init__(
+            dataset_name="ura-hcmut/zalo_e2eqa",
+            revision="63494521f4de949bfa57a5f0b79bc3ee47e635ad",
+            splits={
+                TRAIN_SPLIT: "train",
+                TEST_SPLIT: "test",
+            },
+        )
+
+
+class MELTMultipleChoiceQAScenario(Scenario):
+    name = "melt_multiple_choice_qa"
+    description = "Multiple Choice Question Answering scenario."
+    tags = ["question_answering"]
+
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """
+        Initializes the question answering scenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+
+    @abstractmethod
+    def process_example(self, sample: dict) -> Tuple[Input, List[Reference]]:
+        """
+        Given an sample from the dataset, create the input text and
+        list of answers for the instance.
+        """
+        pass
+
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        instances: List[Instance] = []
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        for dataset_split_name, helm_split_name in splits.items():
+            for sample in dataset[dataset_split_name]:
+                inputs, references = self.process_example(sample)
+                instance = Instance(
+                    input=inputs,
+                    references=references,
+                    split=helm_split_name,
+                )
+                instances.append(instance)
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+
+
+class MELTKnowledgeViMMRCScenario(MELTMultipleChoiceQAScenario):
+    """
+    Scenario for the ViMMRC dataset.
+    """
+
+    name = "melt_knowledge_vimmrc"
+    description = "ViMMRC dataset for multiple choice question answering."
+    tags = ["question_answering", "knowledge"]
+
+    def __init__(self, randomize_order: bool = False):
+        super().__init__(
+            dataset_name="ura-hcmut/ViMMRC",
+            revision="fe68800e37aaa84d80b1d93466b36c3fa60d8bcb",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+        )
+        self.randomize_order = randomize_order
+        self.correct_answer_mapping = {
+            "A": 0,
+            "B": 1,
+            "C": 2,
+            "D": 3,
+            "E": 4,
+            "F": 5,
+            "G": 6,
+            "H": 7,
+        }
+        random.seed(42)
+
+    def process_example(self, sample: dict) -> Tuple[Input, List[Reference]]:
+        """
+        Given an sample from the dataset, create the input text and
+        list of answers for the instance.
+        """
+        inputs = PassageQuestionInput(
+            passage=sample["article"],
+            passage_prefix="Ngữ cảnh: ",
+            question=sample["question"],
+            question_prefix="Câu hỏi: ",
+            separator="\n\n",
+        )
+
+        correct_idx = self.correct_answer_mapping[sample["answer"]]
+        references = []
+        for idx, answer in enumerate(eval(sample["options"])):
+            if idx == correct_idx:
+                tags = [CORRECT_TAG]
+            else:
+                tags = []
+
+            references.append(Reference(Output(text=answer), tags=tags))
+
+        if self.randomize_order:
+            random.shuffle(references)
+
+        return inputs, references
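
Similarly, the new melt_knowledge_scenario.py module defines closed-book and multiple-choice knowledge scenarios for Vietnamese. A hypothetical sketch of constructing the ViMMRC scenario; again, the output path is illustrative and the dataset download requires network access:

# Hypothetical usage of the new ViMMRC scenario class added in this release (not part of the diff).
from helm.benchmark.scenarios.melt_knowledge_scenario import MELTKnowledgeViMMRCScenario

scenario = MELTKnowledgeViMMRCScenario(randomize_order=True)  # shuffle answer options per instance
instances = scenario.get_instances(output_path="scenario_output/melt_knowledge")  # output_path value is illustrative
print(f"Built {len(instances)} instances across train/validation/test splits")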