crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/scenarios/exams_multilingual_scenario.py

@@ -0,0 +1,115 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class EXAMSMultilingualScenario(Scenario):
+    """EXAMS: A Multi-subject High School Examinations Dataset
+
+    EXAMS is a benchmark dataset for multilingual and cross-lingual
+    question answering from high school examinations. It consists of
+    more than 24,000 high-quality high school exam questions in 16
+    languages, covering 8 language families and 24 school subjects
+    from Natural Sciences and Social Sciences, among others.
+
+    - https://huggingface.co/datasets/mhardalov/exams
+    - https://aclanthology.org/2020.emnlp-main.438/
+
+    Note: Some dataset rows have the value '@' in the `answerKey` column.
+    These rows will be ignored.
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "exams_multilingual"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D", "E"]
+    HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+    def __init__(self, language: str, subject: str):
+        super().__init__()
+        self.language = language
+        self.subject = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "mhardalov/exams",
+            "multilingual",
+            revision="4ff10804abb3341f8815cacd778181177bba7edd",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row in dataset:
+                question = row["question"]
+                question_info = row["info"]
+                if self.subject != "all" and question_info["subject"] != self.subject:
+                    continue
+                if self.language != "all" and question_info["language"] != self.language:
+                    continue
+                input = Input(text=question["stem"])
+                references: List[Reference] = []
+                if row["answerKey"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answerKey column in row: {row}")
+                    continue
+                correct_choice_index = ord(row["answerKey"]) - ord("A")
+                for choice_index, choice_text in enumerate(question["choices"]["text"]):
+                    references.append(
+                        Reference(
+                            output=Output(text=choice_text),
+                            tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
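
The scenario above keeps only rows whose `answerKey` is one of A–E and marks the correct choice by its letter offset. A minimal, self-contained sketch of that mapping, using a made-up row (real rows come from the `mhardalov/exams` dataset):

```python
# Minimal sketch of the answer-key handling in EXAMSMultilingualScenario.
# The row below is made up; real rows come from the "mhardalov/exams" dataset.
CHOICES = ["A", "B", "C", "D", "E"]

row = {
    "id": "q-0001",
    "answerKey": "C",
    "question": {
        "stem": "Which planet is known as the Red Planet?",
        "choices": {"text": ["Venus", "Jupiter", "Mars", "Saturn"]},
    },
}

if row["answerKey"] not in CHOICES:
    # Rows with an invalid answerKey (e.g. '@') are skipped with a warning.
    print("skipped")
else:
    # "A" -> 0, "B" -> 1, "C" -> 2, ...
    correct_choice_index = ord(row["answerKey"]) - ord("A")
    for choice_index, choice_text in enumerate(row["question"]["choices"]["text"]):
        tag = "CORRECT" if choice_index == correct_choice_index else ""
        print(choice_index, choice_text, tag)
```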

helm/benchmark/scenarios/grammar.py

@@ -2,7 +2,7 @@ from collections import defaultdict
 from dataclasses import dataclass, field, replace
 from functools import cached_property
 from typing import List, Optional
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hwarn
 
 import dacite
 import re
@@ -111,7 +111,7 @@ def validate_grammar(grammar: Grammar):
            # Make sure all categories are defined
            for category in expansion.categories:
                if category not in grammar.category_to_rules:
-                    hlog(f"WARNING: Category {category} is not defined")
+                    hwarn(f"Category {category} is not defined")
 
 
 def read_grammar(path: str) -> Grammar:

helm/benchmark/scenarios/headqa_scenario.py

@@ -57,7 +57,12 @@ class HeadQAScenario(Scenario):
     SKIP_TEXTQA: bool = False
 
     name = "head_qa"
-    description = "A collection of biomedical multiple-choice questions for testing medical knowledge."
+    description = (
+        "HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to"
+        "evaluate a model's medical knowledge and reasoning. Each instance presents a clinical"
+        "or scientific question with four answer options, requiring the model to select the most"
+        "appropriate answer."
+    )
     tags = ["question_answering", "biomedical", "medicine"]
 
     def __init__(self, language: str = "en", category: Optional[str] = None):

helm/benchmark/scenarios/healthqa_br_scenario.py

@@ -0,0 +1,80 @@
+from typing import Any, List
+import re
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class HEALTHQA_BR_Scenario(Scenario):
+    """
+    HealthQA-BR is a large-scale benchmark designed to evaluate the clinical knowledge of Large Language Models (LLMs)
+    within the Brazilian Unified Health System (SUS) context. It comprises 5,632 multiple-choice questions sourced from
+    nationwide licensing exams and residency tests, reflecting real challenges faced by Brazil's public health sector.
+    Unlike benchmarks focused on the U.S. medical landscape, HealthQA-BR targets the Brazilian healthcare ecosystem,
+    covering a wide range of medical specialties and interdisciplinary professions such as nursing, dentistry,
+    psychology, social work, pharmacy, and physiotherapy. This comprehensive approach enables a detailed assessment
+    of AI models’ ability to collaborate effectively in the team-based patient care typical of SUS.
+    """
+
+    name = "healthqa_br"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("Larxel/healthqa-br", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question_choices = example["question"]
+            answer = example["answer"].strip().upper()
+
+            # Separate the question statement from the alternatives
+            question_text, choices_text = self.split_question_and_choices(question_choices)
+
+            # Extract alternatives from text choices_text
+            pattern = r"'([A-Z])':\s*'([^']+)'"
+            matches = re.findall(pattern, choices_text)
+            answers_dict = {label: text for label, text in matches}
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer_text = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if correct_answer_text == answer else [])
+
+            instance = Instance(
+                input=Input(text=question_text),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
+
+    def split_question_and_choices(self, full_text: str):
+        # Search for the first occurrence of the alternative pattern
+        match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
+        if match:
+            # Everything before the alternatives
+            question_part = full_text[: match.start()].strip()
+            # All of the alternatives (from match to end)
+            choices_part = full_text[match.start() :].strip()
+        else:
+            # If you don't find a pattern, consider everything as a question, and no alternative.
+            question_part = full_text.strip()
+            choices_part = ""
+
+        return question_part, choices_part
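
The core of the new HealthQA-BR scenario is regex parsing that separates the question stem from its quoted, lettered alternatives. A small illustrative sketch of that parsing, using a hypothetical input string rather than a real dataset row:

```python
import re

# Illustrative sketch of how HEALTHQA_BR_Scenario splits a raw example into a
# question stem and lettered alternatives. The input string is hypothetical;
# real examples come from the "Larxel/healthqa-br" dataset.
full_text = (
    "What is the recommended initial management?\n"
    "'A': 'Option one'\n"
    "'B': 'Option two'\n"
    "'C': 'Option three'"
)

# Same pattern the scenario uses to locate where the alternatives begin.
match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
question_part = full_text[: match.start()].strip() if match else full_text.strip()
choices_part = full_text[match.start() :].strip() if match else ""

# Same pattern the scenario uses to extract (label, text) pairs.
answers_dict = dict(re.findall(r"'([A-Z])':\s*'([^']+)'", choices_part))

print(question_part)  # What is the recommended initial management?
print(answers_dict)   # {'A': 'Option one', 'B': 'Option two', 'C': 'Option three'}
```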

helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

@@ -0,0 +1,90 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnMCScenario(Scenario):
+    """InfiniteBench En.MC
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
+    InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_mc"
+    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_choice_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            assert len(row["answer"]) == 1
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            references = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
+                for option in row["options"]
+            ]
+            instance = Instance(
+                id=id,
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
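
The InfiniteBench scenarios cap prompt length with a simple whitespace word count rather than a tokenizer. A brief sketch of that filter logic; the texts and the 115,000-word cap below are illustrative values only:

```python
import re

# Sketch of the whitespace word-count filter used by the InfiniteBench
# scenarios to cap prompt length. The texts and the cap are illustrative.
def count_words(text: str) -> int:
    return len(re.split(r"\s+", text.strip()))

context = "word " * 120_000  # stand-in for a very long novel excerpt
question = "Who is the narrator of the story?"
options = ["Alice", "Bob", "Carol", "Dave"]

max_num_words = 115_000
total = count_words(context) + count_words(question) + sum(count_words(o) for o in options)
print(total, total <= max_num_words)  # 120011 False -> this example would be filtered out
```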

helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py

@@ -0,0 +1,85 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnQAScenario(Scenario):
+    """InfiniteBench En.QA
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
+    InfiniteBench that requires models to perform open-form question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_qa"
+    description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_qa_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            instance = Instance(
+                id=id,
+                input=input,
+                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py}

@@ -14,21 +14,19 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_directory_exists
 
 
-class InfiniteBenchSumScenario(Scenario):
-    """InfiniteBench Sum
+class InfiniteBenchEnSumScenario(Scenario):
+    """InfiniteBench En.Sum
 
     InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
-    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
-    InfiniteBench that requires models to generate a concise summary of the novel. The subset is referred
-    to as "En.Sum" in the original paper.
+    understand, and reason over super long contexts (100k+ tokens). InfiniteBench En.Sum is a subset of
+    InfiniteBench that requires models to generate a concise summary of the novel.
     """
 
-    name = "infinite_bench_sum"
-    description = "Summarize a novel from InfiniteBench"
+    name = "infinite_bench_en_sum"
+    description = "∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["summarization"]
 
-    def __init__(self, min_num_words: int, max_num_words: int):
-        self.min_num_words = min_num_words
+    def __init__(self, max_num_words: int):
         self.max_num_words = max_num_words
         super().__init__()
 
@@ -61,9 +59,9 @@ class InfiniteBenchSumScenario(Scenario):
         def count_words(text: str) -> int:
             return len(re.split(r"\s+", text.strip()))
 
-        dataset = dataset.map(
-            lambda example: {"prompt_wc": count_words(example["context"]) + count_words(example["input"])}
-        ).filter(lambda example: self.min_num_words <= example["prompt_wc"] <= self.max_num_words)
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"]) + count_words(example["input"]) <= self.max_num_words
+        )
 
         # Read all instances
         instances: List[Instance] = []
@@ -75,7 +73,6 @@ class InfiniteBenchSumScenario(Scenario):
                 input=input,
                 references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                 split=TEST_SPLIT,
-                extra_data={"word_count": row["prompt_wc"]},
             )
             instances.append(instance)
 

helm/benchmark/scenarios/kpi_edgar_scenario.py

@@ -0,0 +1,151 @@
+import os
+from typing import List, Dict
+import json
+import re
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class KPIEDGARScenario(Scenario):
+    """A financial named entity recognition (NER) scenario based on KPI-EDGAR (T. Deußer et al., 2022).
+
+    This scenario has been modified from the paper. The original paper has 12 entity types and requires the model
+    to extract pairs of related entities. This scenario only use four named entity types (kpi, cy, py, py1) and only
+    requires the model to extract individual entities.
+
+    Paper:
+    T. Deußer et al.,
+    “KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents.” 2022.
+    https://arxiv.org/abs/2210.09163
+
+    Prompt format:
+
+    ```
+    Context: {Sentence}
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example input:
+
+    ```
+    Context: The following table summarizes our total share-based compensation expense and excess tax benefits recognized : As of December 28 , 2019 , there was $ 284 million of total unrecognized compensation cost related to nonvested share-based compensation grants .
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example reference:
+    ```
+    284 [cy], total unrecognized compensation cost [kpi]
+    ```"""  # noqa: E501
+
+    name = "kpi_edgar"
+    description = "Named Entity Recognition from financial documents."
+    tags = ["named_entity_recognition", "finance"]
+
+    TAG_DICT = {
+        "kpi": "Key Performance Indicators expressible in numerical and monetary value",
+        "cy": "Current Year monetary value",
+        "py": "Prior Year monetary value",
+        "py1": "Two Year Past Value",
+    }
+    TAG_DESCRIPTIONS = ", ".join(["%s: %s" % (key, val) for (key, val) in TAG_DICT.items()]) + "."
+    TAG_PAREN_RE = (r"\[", r"\]")
+    TAG_PAREN = tuple((e.strip("\\") for e in TAG_PAREN_RE))
+    TAG_PAREN_ESC = ("(", ")")
+    DATASET_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT, "test": TEST_SPLIT}
+    JSON_URL = "https://raw.githubusercontent.com/tobideusser/kpi-edgar/2ec7084dcd55b4979bbe288d4aa1e962c685c9ab/data/kpi_edgar.json"  # noqa: E501
+    JSON_FILENAME = "kpi_edgar.json"
+
+    @staticmethod
+    def get_sentences(dataset: List[Dict]) -> List[Dict]:
+        return [
+            sentence
+            for document in dataset
+            for segment in document["segments"]
+            for sentence in segment["sentences"] or []
+        ]
+
+    @staticmethod
+    def escape_parenthesis(text: str) -> str:
+        tmp0 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[0], KPIEDGARScenario.TAG_PAREN_ESC[0], text)
+        tmp1 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[1], KPIEDGARScenario.TAG_PAREN_ESC[1], tmp0)
+        return tmp1
+
+    @staticmethod
+    def get_output_text(
+        words: List[str],
+        annotations: List[Dict],
+    ) -> str:
+        # def get_entity_for_annotation(words: List[str], annotation: Dict) -> str
+        entities: List[str] = []
+        for annotation in annotations:
+            annotation_type = annotation["type_"]
+            if annotation_type not in KPIEDGARScenario.TAG_DICT:
+                continue
+            start_idx = annotation["start"]
+            end_idx = annotation["end"]
+            annotated_words = words[start_idx:end_idx]
+            phrase = KPIEDGARScenario.escape_parenthesis(" ".join(annotated_words))
+            entities.append(
+                "%s %s%s%s" % (phrase, KPIEDGARScenario.TAG_PAREN[0], annotation_type, KPIEDGARScenario.TAG_PAREN[1])
+            )
+
+        return ", ".join(entities)
+
+    @staticmethod
+    def sentences_to_instances(sentences: List[Dict]) -> List[Instance]:
+        instances: List[Instance] = []
+        for sentence in sentences:
+            dataset_split: str = sentence["split_type"]
+            if dataset_split is None:
+                continue
+            split = KPIEDGARScenario.DATASET_SPLIT_TO_HELM_SPLIT[dataset_split]
+
+            words: List[str] = [word_dict["value"] for word_dict in sentence["words"]]
+            passage = KPIEDGARScenario.escape_parenthesis(" ".join(words))
+            input_text = (
+                "Context: %s\n"
+                "Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.\n"  # noqa: E501
+                "%s" % (passage, KPIEDGARScenario.TAG_DESCRIPTIONS)
+            )
+
+            annotations = sentence["entities_anno"]
+            output_text = KPIEDGARScenario.get_output_text(words, annotations)
+            if not output_text:
+                continue
+
+            instances.append(
+                Instance(
+                    input=Input(text=input_text),
+                    references=[Reference(Output(text=output_text), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+            )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        base_url = self.JSON_URL
+        dataset_file_name = self.JSON_FILENAME
+        target_path = os.path.join(data_path, dataset_file_name)
+        ensure_file_downloaded(source_url=base_url, target_path=target_path)
+
+        with open(target_path, "r") as f:
+            raw_dataset = json.load(f)
+        return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
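
The reference strings produced by `KPIEDGARScenario.get_output_text` follow the `<phrase> [<tag>]` format shown in the docstring above. A compact sketch of that construction, with hypothetical annotation offsets in the `kpi_edgar.json` style:

```python
# Sketch of how KPIEDGARScenario.get_output_text assembles the
# "<phrase> [<tag>]" reference string shown in its docstring. The annotation
# offsets are hypothetical but follow the kpi_edgar.json layout
# (a "type_" plus "start"/"end" word indices into the sentence).
TAG_DICT = {"kpi": "...", "cy": "...", "py": "...", "py1": "..."}

words = ["there", "was", "$", "284", "million", "of", "total",
         "unrecognized", "compensation", "cost"]
annotations = [
    {"type_": "cy", "start": 3, "end": 4},     # "284"
    {"type_": "kpi", "start": 6, "end": 10},   # "total unrecognized compensation cost"
    {"type_": "other", "start": 4, "end": 5},  # hypothetical type outside TAG_DICT: skipped
]

entities = []
for annotation in annotations:
    if annotation["type_"] not in TAG_DICT:
        continue
    phrase = " ".join(words[annotation["start"] : annotation["end"]])
    entities.append("%s [%s]" % (phrase, annotation["type_"]))

print(", ".join(entities))  # 284 [cy], total unrecognized compensation cost [kpi]
```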