crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/metrics/med_dialog_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MedDialogMetric(Metric):
+    """Score metrics for MedDialog."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["med_dialog"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("med_dialog_accuracy")).add(score),
+        ]
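
This hunk is the template for the other annotator-backed MedHELM metrics in this diff (medalign, medi_qa, medication_qa, mental_health, and mimic_rrs follow it nearly verbatim): each judge model listed in the matching annotator's ANNOTATOR_MODELS contributes per-criterion scores, and the metric reports their mean. A minimal sketch of that averaging, with an invented annotation payload (the keys "gpt-judge" and "claude-judge" are hypothetical stand-ins for the real annotator entries):

    # Payload shaped like request_state.annotations["med_dialog"]:
    # {annotator_key: {criterion: {"score": ...}}}; all names here are made up.
    ANNOTATOR_MODELS = {"gpt-judge": None, "claude-judge": None}

    annotations = {
        "gpt-judge": {"accuracy": {"score": "4"}, "completeness": {"score": "5"}},
        "claude-judge": {"accuracy": {"score": "3"}},
        "prompt_text": None,  # non-judge keys fail the membership check and are skipped
    }

    scores = []
    for key, per_model in annotations.items():
        if key in ANNOTATOR_MODELS and per_model is not None:
            for val in per_model.values():
                scores.append(int(val["score"]))

    print(sum(scores) / len(scores) if scores else 0.0)  # (4 + 5 + 3) / 3 = 4.0
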
helm/benchmark/metrics/medalign_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MedalignMetric(Metric):
+    """Score metrics for Medalign."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["medalign"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("medalign_accuracy")).add(score),
+        ]
helm/benchmark/metrics/medcalc_bench_metrics.py
@@ -0,0 +1,124 @@
+import re
+
+from datetime import datetime
+from typing import List, Dict, Any
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+
+
+class MedCalcBenchMetric(Metric):
+    """
+    Metric for evaluating the MedCalc Bench dataset, assessing the model's ability to
+    be a clinical calculator.
+
+    Exact match based on category:
+    1. Normal exact match: for categories "risk", "severity" or "diagnosis".
+    2. Variant exact match: for other categories, if the number calculated by the model falls between the values
+       in the Lower limit and Upper limit columns, we mark it as accurate.
+    """
+
+    def parse_duration(self, duration_str) -> int:
+        """Parses a duration tuple (weeks, days) from a string format like ('14 weeks', '2 days')."""
+        match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
+        if match:
+            weeks, days = map(int, match.groups())
+            return weeks * 7 + days  # Convert to total days
+        else:
+            raise ValueError(f"Invalid format: {duration_str}")
+
+    def is_within_range(self, lower_bound, upper_bound, prediction) -> int:
+        """
+        Checks if a predicted duration falls within the given range.
+
+        Args:
+            lower_bound (str): The lower bound in format "('X weeks', 'Y days')".
+            upper_bound (str): The upper bound in format "('X weeks', 'Y days')".
+            prediction (str): The predicted duration in the same format.
+
+        Returns:
+            int: 1 if within range (inclusive), 0 otherwise.
+        """
+        lower_days = self.parse_duration(lower_bound)
+        upper_days = self.parse_duration(upper_bound)
+        prediction_days = self.parse_duration(prediction)
+        return 1 if lower_days <= prediction_days <= upper_days else 0
+
+    def check_date(self, prediction: str, reference: str, extra_data: Dict[str, Any]) -> int:
+        """Checks if the prediction date is within limits."""
+        if re.match(r"\('(\d+) weeks', '(\d+) days'\)", reference):
+            exact_match = self.is_within_range(extra_data["lower_limit"], extra_data["upper_limit"], prediction)
+        else:
+            prediction_date = self._str_to_date(prediction)
+            upper_limit_date = self._str_to_date(extra_data["upper_limit"])
+            lower_limit_date = self._str_to_date(extra_data["lower_limit"])
+            exact_match = 1 if lower_limit_date <= prediction_date <= upper_limit_date else 0
+        return exact_match
+
+    def _str_to_date(self, date_str: str) -> datetime:
+        """Convert string to datetime object."""
+        return datetime.strptime(date_str, "%m/%d/%Y")
+
+    def check_in_range(self, prediction: str, reference: str, extra_data: Dict[str, Any], category: str) -> int:
+        """Check if the prediction falls within the range specified by the reference."""
+        try:
+            if category == "date":
+                exact_match = self.check_date(prediction, reference, extra_data)
+            elif category in ["dosage conversion", "physical"]:
+                lower_limit = float(extra_data["lower_limit"])
+                upper_limit = float(extra_data["upper_limit"])
+                float_prediction = float(prediction)
+                exact_match = 1 if lower_limit <= float_prediction <= upper_limit else 0
+            else:
+                raise ValueError(f"Category {category} not supported")
+        except ValueError:
+            return 0
+
+        return exact_match
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single generation against reference labels.
+        """
+        # Extract predictions
+        assert request_state.result, "request_state.result is unexpectedly None"
+        predictions = [completion.text.strip() for completion in request_state.result.completions]
+
+        if not predictions:
+            hlog("Warning: No predictions found in completions")
+            return []
+
+        # Get the first prediction
+        prediction = predictions[0]
+
+        # Get references
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        reference = references[0].output.text
+
+        # Extract category, upper limit and lower limit
+        assert request_state.instance.extra_data, "Extra data dict was expected but got None"
+        category = request_state.instance.extra_data["category"]
+
+        if category in ["risk", "severity", "diagnosis"]:
+            exact_match = 1 if prediction == reference else 0
+        else:
+            exact_match = self.check_in_range(prediction, reference, request_state.instance.extra_data, category)
+
+        return [
+            Stat(MetricName("medcalc_bench_accuracy")).add(exact_match),
+        ]
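
A worked example of the "variant exact match" path for the "date" category, reusing the duration grammar from parse_duration above. The limit values here are invented; in the scenario they come from extra_data["lower_limit"] and extra_data["upper_limit"]:

    import re

    def parse_duration(duration_str: str) -> int:
        """Total days from a "('X weeks', 'Y days')" string, mirroring the method above."""
        match = re.match(r"\('(\d+) weeks', '(\d+) days'\)", duration_str)
        if not match:
            raise ValueError(f"Invalid format: {duration_str}")
        weeks, days = map(int, match.groups())
        return weeks * 7 + days

    lower = parse_duration("('13 weeks', '5 days')")  # 13 * 7 + 5 = 96 days
    upper = parse_duration("('14 weeks', '6 days')")  # 14 * 7 + 6 = 104 days
    pred = parse_duration("('14 weeks', '2 days')")   # 14 * 7 + 2 = 100 days
    print(1 if lower <= pred <= upper else 0)         # 1: inclusive range check
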
helm/benchmark/metrics/medec_metrics.py
@@ -0,0 +1,101 @@
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+import re
+from helm.benchmark.scenarios.scenario import CORRECT_TAG
+
+
+class MedecMetric(Metric):
+    """
+    Metric for evaluating the MEDEC dataset, assessing medical error detection and correction.
+
+    - **Error Flag Accuracy**: Whether the model correctly identifies if a medical note contains an error.
+    - **Error Sentence Detection Accuracy**: Whether the model correctly identifies the erroneous
+      sentence when an error is present.
+    """
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single LLM generation against the ground truth labels.
+        """
+
+        # Extract predictions from the model output
+        if request_state.result is not None:
+            predictions = [completion.text.strip() for completion in request_state.result.completions]
+        else:
+            predictions = []
+
+        if not predictions:
+            raise ValueError("No predictions found in the completions.")
+
+        prediction = predictions[0]  # Use the first completion
+
+        # Extract references (ground truth)
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        ground_truth_reference = next((ref for ref in references if CORRECT_TAG in ref.tags), None)
+
+        if not ground_truth_reference:
+            hlog(f"Warning: No ground truth reference with CORRECT_TAG for instance {request_state.instance}")
+            return []
+
+        # Extract the ground truth error flag and sentence ID
+        # ground_truth_text_pre = ground_truth_reference.output.text.strip()
+        ground_truth_text = ground_truth_reference.output.text.replace(".", "").strip()
+
+        # Determine ground truth values
+        if ground_truth_text == "CORRECT":
+            ground_truth_flag = 0  # No medical error
+            ground_truth_sentence_id = -1  # No sentence should be flagged
+        else:
+            ground_truth_flag = 1  # Medical error exists
+            try:
+                ground_truth_sentence_id = int(ground_truth_text)
+            except ValueError:
+                ground_truth_sentence_id = -1
+
+        # Determine model predictions
+        prediction = prediction.strip()
+
+        if prediction == "CORRECT":
+            predicted_flag = 0
+            predicted_sentence_id = -1
+        else:
+            match = re.match(r"^(\d+)", prediction)  # Ensure first token is a number
+            if match:
+                predicted_sentence_id = int(match.group(1))
+                predicted_flag = 1  # A valid sentence ID implies the model thinks there's an error
+            else:
+                predicted_flag = 1  # The model attempted to correct something, but in an invalid format
+                predicted_sentence_id = -2  # Mark invalid outputs
+
+        # Calculate Error Flag Accuracy
+        flag_accuracy = 1 if predicted_flag == ground_truth_flag else 0
+
+        # Calculate Error Sentence Accuracy
+        if ground_truth_flag == 1 and predicted_flag == 1:
+            sentence_accuracy = 1 if predicted_sentence_id == ground_truth_sentence_id else 0
+        else:
+            sentence_accuracy = (
+                1 if ground_truth_flag == 0 and predicted_flag == 0 else 0
+            )  # Both must agree it's "CORRECT"
+
+        return [
+            Stat(MetricName("medec_error_flag_accuracy")).add(flag_accuracy),
+            Stat(MetricName("medec_error_sentence_accuracy")).add(sentence_accuracy),
+        ]
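
The metric assumes a fixed output convention: the literal "CORRECT", or a sentence ID as the first token. A self-contained sketch of that parsing (the sample strings are invented):

    import re

    def parse_medec_output(prediction: str):
        """Mirrors the flag/sentence-id logic above; returns (error_flag, sentence_id)."""
        prediction = prediction.strip()
        if prediction == "CORRECT":
            return 0, -1                   # no error, no flagged sentence
        match = re.match(r"^(\d+)", prediction)
        if match:
            return 1, int(match.group(1))  # error flagged at sentence N
        return 1, -2                       # error claimed, but in an unparseable format

    print(parse_medec_output("CORRECT"))             # (0, -1)
    print(parse_medec_output("4: add amoxicillin"))  # (1, 4)
    print(parse_medec_output("The note is wrong"))   # (1, -2)
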
helm/benchmark/metrics/medi_qa_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MediQAMetric(Metric):
+    """Score metrics for MediQA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["medi_qa"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("medi_qa_accuracy")).add(score),
+        ]
helm/benchmark/metrics/medication_qa_metrics.py
@@ -1,14 +1,15 @@
-from typing import List
+from typing import Any, Dict, List
 
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
 from helm.benchmark.metrics.metric import Metric
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 
 
-class MedicationQAScoreMetric(Metric):
+class MedicationQAMetric(Metric):
     """Score metrics for MedicationQA."""
 
     def evaluate_generation(
@@ -19,5 +20,15 @@ class MedicationQAScoreMetric(Metric):
         eval_cache_path: str,
     ) -> List[Stat]:
         assert request_state.annotations
-        score = request_state.annotations["medication_qa"]["score"]
-        return [Stat(MetricName("medication_qa_score")).add(score)]
+        annotations: Dict[str, Any] = request_state.annotations["medication_qa"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("medication_qa_accuracy")).add(score),
+        ]
helm/benchmark/metrics/mental_health_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MentalHealthMetric(Metric):
+    """Score metrics for MentalHealth."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["mental_health"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("mental_health_accuracy")).add(score),
+        ]
helm/benchmark/metrics/metric.py
@@ -15,9 +15,9 @@ from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.scenarios.scenario import Instance
-from .metric_name import MetricName, MetricContext
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName, MetricContext
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 
 
 @dataclass(frozen=True)
helm/benchmark/metrics/mimic_rrs_metrics.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MIMICRRSMetric(Metric):
+    """Score metrics for MIMICRRS."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["mimic_rrs"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("mimic_rrs_accuracy")).add(score),
+        ]
@@ -0,0 +1,96 @@
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+import re
+from sklearn.metrics import precision_score, recall_score, f1_score
+from sklearn.preprocessing import MultiLabelBinarizer
+
+
+class MIMICIVBillingCodeMetric(Metric):
+    """
+    Metric for evaluating the MIMIC Billing Code dataset, assessing the model's ability to match the
+    reference ICD codes. Handles cases where raw prediction output contains additional text.
+
+    Calculates:
+    1. Precision: proportion of correctly predicted ICD codes among all predicted codes
+    2. Recall: proportion of correctly predicted ICD codes among all reference codes
+    3. F1 score: harmonic mean of precision and recall
+
+    ICD code format: letter followed by 1-3 digits, optional period, optional additional digits
+    Examples: "J18.9", "J45.909", "J47.1", "J96.01"
+    """
+
+    def extract_icd_codes(self, text: str) -> List[str]:
+        """Extract ICD codes from text, handling markdown and standardizing format."""
+        if not text:
+            return []
+
+        # Remove markdown bold formatting
+        cleaned_text = re.sub(r"\*\*", "", text)
+        # Match ICD code pattern with optional period and trailing digits
+        pattern = r"\b[A-Z]\d{1,3}(?:\.\d{1,4})?\.?\b"
+        codes = re.findall(pattern, cleaned_text)
+        # Standardize by removing trailing periods
+        return [code.rstrip(".").strip() for code in codes]
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate a single generation against reference labels.
+        """
+        # Extract predictions
+        if request_state.result is None:
+            predictions = []
+        else:
+            predictions = [completion.text.strip() for completion in request_state.result.completions]
+        if not predictions:
+            hlog("Warning: No predictions found in completions")
+            return []
+
+        # Get the first prediction
+        prediction = predictions[0]
+
+        # Get references
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        # Extract codes from reference and prediction
+        ref_codes = []
+        for ref in references:
+            if ref.output.text:
+                ref_codes.extend(self.extract_icd_codes(ref.output.text))
+        ref_codes = list(set(ref_codes))  # Remove duplicates
+
+        pred_codes = self.extract_icd_codes(prediction)
+        pred_codes = list(set(pred_codes))  # Remove duplicates
+
+        # Convert to binary format for metrics
+        all_codes = sorted(list(set(ref_codes + pred_codes)))
+        mlb = MultiLabelBinarizer(classes=all_codes)
+
+        y_true_bin = mlb.fit_transform([ref_codes])
+        y_pred_bin = mlb.transform([pred_codes])
+
+        # Calculate metrics
+        precision = precision_score(y_true_bin, y_pred_bin, average="micro", zero_division=0)
+        recall = recall_score(y_true_bin, y_pred_bin, average="micro", zero_division=0)
+        f1 = f1_score(y_true_bin, y_pred_bin, average="micro", zero_division=0)
+
+        return [
+            Stat(MetricName("mimiciv_billing_code_precision")).add(precision),
+            Stat(MetricName("mimiciv_billing_code_recall")).add(recall),
+            Stat(MetricName("mimiciv_billing_code_f1")).add(f1),
+        ]
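To see the extraction and micro-averaged scoring end to end, here is a self-contained sketch of the same steps (the sample text and codes are invented; only the regex above and standard sklearn behavior are assumed):

import re
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer

# Same cleaning and pattern as extract_icd_codes above: "**J18.9**" -> "J18.9"
pattern = r"\b[A-Z]\d{1,3}(?:\.\d{1,4})?\.?\b"
text = "Diagnoses: **J18.9** and J96.01."
cleaned = re.sub(r"\*\*", "", text)
pred_codes = sorted({c.rstrip(".") for c in re.findall(pattern, cleaned)})  # ['J18.9', 'J96.01']

ref_codes = ["J18.9", "J45.909"]
all_codes = sorted(set(ref_codes + pred_codes))  # ['J18.9', 'J45.909', 'J96.01']
mlb = MultiLabelBinarizer(classes=all_codes)
y_true = mlb.fit_transform([ref_codes])   # [[1, 1, 0]]
y_pred = mlb.transform([pred_codes])      # [[1, 0, 1]]

# One true positive (J18.9) out of two predictions and two references:
print(precision_score(y_true, y_pred, average="micro", zero_division=0))  # 0.5
print(recall_score(y_true, y_pred, average="micro", zero_division=0))     # 0.5
print(f1_score(y_true, y_pred, average="micro", zero_division=0))         # 0.5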
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.mtsamples_procedures_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MTSamplesProceduresMetric(Metric):
+    """Score metrics for MTSamplesProcedures."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["mtsamples_procedures"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("mtsamples_procedures_accuracy")).add(score),
+        ]
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.mtsamples_replicate_annotator import ANNOTATOR_MODELS
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MTSamplesReplicateMetric(Metric):
+    """Score metrics for MTSamplesReplicate."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["mtsamples_replicate"]
+        scores: List[int] = []
+        score = 0.0
+        for annotation_key, annotation_dict in annotations.items():
+            if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
+                for val in annotation_dict.values():
+                    scores.append(int(val["score"]))
+        if scores:
+            score = sum(scores) / len(scores)
+        return [
+            Stat(MetricName("mtsamples_replicate_accuracy")).add(score),
+        ]
@@ -0,0 +1,32 @@
+import nltk
+from importlib.metadata import version
+
+
+def install_nltk_resources():
+    """Install resources for nltk tokenizers, which are required for BLEU and ROUGE scores."""
+    # Install "punkt_tab" for nltk>=3.9.1 or "punkt" for nltk<=3.8.1
+    #
+    # Note that nltk 3.9.0 is disallowed due to https://github.com/nltk/nltk/issues/3308
+    #
+    # "punkt" is no longer supported for newer versions of nltk due to a security issue
+    # and has been replaced by "punkt_tab". For more information, see:
+    #
+    # - https://github.com/stanford-crfm/helm/issues/2926
+    # - https://github.com/nltk/nltk/issues/3293
+    # - https://github.com/nltk/nltk/issues/3266
+    # - https://nvd.nist.gov/vuln/detail/CVE-2024-39705
+    #
+    # TODO: Remove support for nltk<=3.8.1 and only install "punkt_tab"
+    nltk_major_version, nltk_minor_version = [int(v) for v in version("nltk").split(".")[0:2]]
+    if nltk_major_version < 3:
+        raise Exception("nltk version <3 is not supported")
+    if nltk_major_version == 3 and nltk_minor_version <= 8:
+        try:
+            nltk.data.find("tokenizers/punkt")
+        except LookupError:
+            nltk.download("punkt")
+    else:
+        try:
+            nltk.data.find("tokenizers/punkt_tab")
+        except LookupError:
+            nltk.download("punkt_tab")
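The resource selection above boils down to a single version comparison; a standalone sketch of that gate, assuming only that nltk is installed:

from importlib.metadata import version

# Parse major/minor from the installed nltk version, e.g. "3.9.1" -> (3, 9).
major, minor = (int(v) for v in version("nltk").split(".")[:2])
if major < 3:
    raise Exception("nltk version <3 is not supported")
# nltk<=3.8 still ships "punkt"; later releases use "punkt_tab" instead.
resource = "punkt" if (major == 3 and minor <= 8) else "punkt_tab"
print(resource)  # e.g. "punkt_tab" on nltk 3.9.1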
@@ -12,10 +12,10 @@ from helm.benchmark.scenarios.numeracy_scenario import ( # noqa
     distance_plane,
     distance_paraboloid,
 )
-from .metric import Metric
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 
 
 class DistanceMetric(Metric):