crfm-helm: 0.5.4 (py3-none-any wheel) → 0.5.5 (py3-none-any wheel)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. Click here for more details.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
@@ -1,32 +1,31 @@
1
1
  from dataclasses import replace
2
+ from functools import partial
2
3
  from typing import Callable, Dict, List, Optional, Set, Tuple, cast
4
+ import re
5
+ import string
6
+
7
+ from nltk.metrics.scores import f_measure
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.translate.bleu_score import sentence_bleu
10
+ from rouge_score import rouge_scorer
3
11
  import numpy as np
4
- from functools import partial
12
+
5
13
  from helm.benchmark.adaptation.adapter_spec import AdapterSpec
6
14
  from helm.benchmark.adaptation.request_state import RequestState
15
+ from helm.benchmark.metrics import code_metrics_helper
7
16
  from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
8
17
  from helm.benchmark.metrics.metric_name import MetricName
9
18
  from helm.benchmark.metrics.metric_service import MetricService
19
+ from helm.benchmark.metrics.nltk_helper import install_nltk_resources
10
20
  from helm.benchmark.metrics.statistic import Stat
11
21
  from helm.benchmark.scenarios.code_scenario import CodeReference
22
+ from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
12
23
  from helm.benchmark.scenarios.scenario import Reference
13
24
  from helm.common.optional_dependencies import handle_module_not_found_error
14
25
  from helm.common.request import GeneratedOutput
15
- from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
16
- from nltk.metrics.scores import f_measure
17
- from nltk.translate.bleu_score import sentence_bleu
18
- from nltk.tokenize import word_tokenize
19
- from rouge_score import rouge_scorer
20
- import re
21
- import string
22
- from . import code_metrics_helper
23
- import nltk
24
26
 
25
27
 
26
- try:
27
- nltk.data.find("tokenizers/punkt")
28
- except LookupError:
29
- nltk.download("punkt") # Required for rouge
28
+ install_nltk_resources()
30
29
 
31
30
 
32
31
  def pass_at_k_estimator(n: int, c: int, k: int) -> float:
@@ -40,7 +39,7 @@ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
40
39
  return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
41
40
 
42
41
 
43
- def normalize_text(text: str) -> str:
42
+ def normalize_text(text: str, should_remove_articles: bool = True) -> str:
44
43
  """Lower text and remove punctuation, articles and extra whitespace.
45
44
  Copied from the [QuAC](http://quac.ai/) evaluation script found at
46
45
  https://s3.amazonaws.com/my89public/quac/scorer.py"""
@@ -58,7 +57,10 @@ def normalize_text(text: str) -> str:
58
57
  def lower(text: str) -> str:
59
58
  return text.lower()
60
59
 
61
- return white_space_fix(remove_articles(remove_punc(lower(text))))
60
+ normalized_text = remove_punc(lower(text))
61
+ if should_remove_articles:
62
+ normalized_text = remove_articles(normalized_text)
63
+ return white_space_fix(normalized_text)
62
64
 
63
65
 
64
66
  def exact_match(gold: str, pred: str) -> float:
@@ -75,6 +77,17 @@ def quasi_exact_match(gold: str, pred: str) -> float:
75
77
  return 1 if normalize_text(gold) == normalize_text(pred) else 0
76
78
 
77
79
 
80
def quasi_leave_articles_exact_match(gold: str, pred: str) -> float:
    """Exact match after normalization that keeps articles.

    Both strings go through `normalize_text` with `should_remove_articles=False`
    before being compared.

    Args:
        gold: The reference text.
        pred: The predicted text.

    Returns:
        1 if the normalized strings are equal, otherwise 0. An empty `pred`
        always scores 0.
    """
    if not pred:
        return 0

    normalized_gold = normalize_text(gold, should_remove_articles=False)
    normalized_pred = normalize_text(pred, should_remove_articles=False)
    if normalized_gold == normalized_pred:
        return 1
    return 0
89
+
90
+
78
91
  def prefix_exact_match(gold: str, pred: str) -> float:
79
92
  """
80
93
  The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
@@ -203,6 +216,112 @@ def cider(gold: str, pred: str) -> float:
203
216
  return average_score
204
217
 
205
218
 
219
def wer_score(gold: str, pred: str) -> float:
    """Compute the Word Error Rate (WER) of `pred` against `gold`.

    WER is a common metric for the accuracy of speech recognition systems.
    The lower the better; the value might be greater than 1.
    https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate

    Both strings are normalized with `normalize_text` (articles are kept)
    before scoring.

    Args:
        gold: The reference transcript.
        pred: The predicted transcript.

    Returns:
        The word error rate, or 1.0 when `pred` is empty.
    """
    try:
        from jiwer import wer
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    if not pred:
        # An empty prediction recovers none of the reference words. Because
        # lower is better, returning 0 here would give a model that produced
        # no output the best possible score, so report a total miss instead.
        return 1.0

    normalized_gold = normalize_text(gold, should_remove_articles=False)
    normalized_pred = normalize_text(pred, should_remove_articles=False)
    return wer(normalized_gold, normalized_pred)
235
+
236
+
237
def mer_score(gold: str, pred: str) -> float:
    """Compute the Match Error Rate (MER) of `pred` against `gold`.

    MER evaluates the error rate of speech recognition systems. The lower
    the better.

    Both strings are normalized with `normalize_text` (articles are kept)
    before scoring.

    Args:
        gold: The reference transcript.
        pred: The predicted transcript.

    Returns:
        The match error rate, or 1.0 when `pred` is empty.
    """
    try:
        from jiwer import mer
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    if not pred:
        # An empty prediction matches nothing in the reference. Because lower
        # is better, returning 0 here would give a model that produced no
        # output the best possible score, so report a total miss instead.
        return 1.0

    normalized_gold = normalize_text(gold, should_remove_articles=False)
    normalized_pred = normalize_text(pred, should_remove_articles=False)
    return mer(normalized_gold, normalized_pred)
252
+
253
+
254
def wip_score(gold: str, pred: str) -> float:
    """Compute Word Information Preserved (WIP) of `pred` against `gold`.

    WIP evaluates how much information a speech recognition system
    preserved. The higher the better.

    Both strings are normalized with `normalize_text` (articles are kept)
    before scoring.

    Args:
        gold: The reference transcript.
        pred: The predicted transcript.

    Returns:
        The WIP value, or 0 when `pred` is empty.
    """
    try:
        from jiwer import wip
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    # Nothing was predicted, so no information was preserved.
    if not pred:
        return 0

    normalized_gold = normalize_text(gold, should_remove_articles=False)
    normalized_pred = normalize_text(pred, should_remove_articles=False)
    return wip(normalized_gold, normalized_pred)
269
+
270
+
271
def cer_score(gold: str, pred: str) -> float:
    """Compute the Character Error Rate (CER) of `pred` against `gold`.

    CER evaluates the accuracy of speech recognition systems at the
    character level. The lower the better.

    Both strings are normalized with `normalize_text` (articles are kept)
    before scoring.

    Args:
        gold: The reference transcript.
        pred: The predicted transcript.

    Returns:
        The character error rate, or 1.0 when `pred` is empty.
    """
    try:
        from jiwer import cer
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    if not pred:
        # An empty prediction recovers none of the reference characters.
        # Because lower is better, returning 0 here would give a model that
        # produced no output the best possible score.
        return 1.0

    normalized_gold = normalize_text(gold, should_remove_articles=False)
    normalized_pred = normalize_text(pred, should_remove_articles=False)
    cer_ret = cer(normalized_gold, normalized_pred)
    # Narrow the type of the jiwer result to a plain float for callers.
    assert isinstance(cer_ret, float)
    return cer_ret
287
+
288
+
289
def chinese_wer_score(gold: str, pred: str) -> float:
    """Word Error Rate for Chinese text.

    Each string is segmented with `jieba.cut`, the segments are joined with
    single spaces, and the result is scored by `wer_score`.
    """
    try:
        import jieba
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    segmented_gold = " ".join(jieba.cut(gold))
    segmented_pred = " ".join(jieba.cut(pred))
    return wer_score(segmented_gold, segmented_pred)
296
+
297
+
298
def chinese_mer_score(gold: str, pred: str) -> float:
    """Match Error Rate for Chinese text.

    Each string is segmented with `jieba.cut`, the segments are joined with
    single spaces, and the result is scored by `mer_score`.
    """
    try:
        import jieba
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    segmented_gold = " ".join(jieba.cut(gold))
    segmented_pred = " ".join(jieba.cut(pred))
    return mer_score(segmented_gold, segmented_pred)
305
+
306
+
307
def chinese_wip_score(gold: str, pred: str) -> float:
    """Word Information Preserved for Chinese text.

    Each string is segmented with `jieba.cut`, the segments are joined with
    single spaces, and the result is scored by `wip_score`.
    """
    try:
        import jieba
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    segmented_gold = " ".join(jieba.cut(gold))
    segmented_pred = " ".join(jieba.cut(pred))
    return wip_score(segmented_gold, segmented_pred)
314
+
315
+
316
def chinese_cer_score(gold: str, pred: str) -> float:
    """Character Error Rate for Chinese text.

    Each string is segmented with `jieba.cut`, the segments are joined with
    single spaces, and the result is scored by `cer_score`.
    """
    try:
        import jieba
    except ModuleNotFoundError as e:
        handle_module_not_found_error(e, ["audiolm"])

    segmented_gold = " ".join(jieba.cut(gold))
    segmented_pred = " ".join(jieba.cut(pred))
    return cer_score(segmented_gold, segmented_pred)
323
+
324
+
206
325
  def extract_set_from_text(
207
326
  set_str: str,
208
327
  set_start_str: str = " is ",
@@ -329,6 +448,7 @@ def compute_reference_metrics(
329
448
  metric_fn_mapping: Dict[str, Callable] = {
330
449
  "exact_match": exact_match,
331
450
  "quasi_exact_match": quasi_exact_match,
451
+ "quasi_leave_articles_exact_match": quasi_leave_articles_exact_match,
332
452
  "prefix_exact_match": prefix_exact_match,
333
453
  "quasi_prefix_exact_match": quasi_prefix_exact_match,
334
454
  "exact_match_indicator": exact_match_indicator,
@@ -352,6 +472,14 @@ def compute_reference_metrics(
352
472
  "chinese_rouge_2": get_chinese_rouge_function("rouge2"),
353
473
  "cleva_math_result_match": cleva_math_result_match,
354
474
  "absolute_value_difference": absolute_value_difference,
475
+ "wer_score": wer_score,
476
+ "mer_score": mer_score,
477
+ "wip_score": wip_score,
478
+ "cer_score": cer_score,
479
+ "chinese_wer_score": chinese_wer_score,
480
+ "chinese_mer_score": chinese_mer_score,
481
+ "chinese_wip_score": chinese_wip_score,
482
+ "chinese_cer_score": chinese_cer_score,
355
483
  }
356
484
 
357
485
  stats: List[Stat] = []
@@ -0,0 +1,103 @@
1
+ import re
2
+ from typing import List, Optional
3
+
4
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.benchmark.metrics.metric import Metric
7
+ from helm.benchmark.metrics.metric_name import MetricName
8
+ from helm.benchmark.metrics.metric_service import MetricService
9
+ from helm.benchmark.metrics.statistic import Stat
10
+
11
+
12
def extract_answer(output_text: str) -> Optional[str]:
    """
    Extracts the answer letter from the output text by trying four regex
    patterns in order; the first pattern that matches wins.

    Args:
        output_text (str): The text from which to extract the answer.

    Returns:
        Optional[str]: The extracted answer (A-J) if found, otherwise None.
    """
    answer_patterns = (
        # "answer is X" with optional parentheses around the letter.
        r"answer is \(?([A-J])\)?",
        # "Answer: X" / "answer: X" with optional parentheses. The brackets
        # must stay unescaped so that [aA] is a character class; escaping them
        # would match the literal text "[aA]nswer:" only.
        r"\.*[aA]nswer:\s*\(?([A-J])\)?",
        # "correct answer is" followed by non-letter characters (e.g. "**")
        # and then the letter.
        r"correct answer is [^A-Za-z]*([A-J])",
        # "correct answer is" followed by anything except capital letters
        # (e.g. "option ") and then the letter.
        r"correct answer is [^A-Z]*([A-J])",
    )
    for pattern in answer_patterns:
        match = re.search(pattern, output_text)
        if match:
            return match.group(1)

    # No pattern matched.
    return None
45
+
46
+
47
class GPQAChainOfThoughtMetric(Metric):
    """
    Binary correctness metric for chain-of-thought multiple-choice answers.

    The answer letter is extracted from the model's first completion and
    compared with the letter of the correct reference.
    """

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """
        Score one request by comparing the extracted answer letter with the correct one.

        Args:
            adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation.
            request_state (RequestState): The state of the current request, including
                the input instance, output results, and references.
            metric_service (MetricService): Not used by this metric.
            eval_cache_path (str): Not used by this metric.

        Returns:
            List[Stat]: A single `Stat` named "chain_of_thought_correctness" holding
                1 when the extracted letter equals the correct letter, otherwise 0.

        Raises:
            ValueError: If none of the instance's references is marked correct.
        """
        # The first completion is the only one that gets scored.
        assert (
            request_state.result is not None and request_state.result.completions
        ), "Request state result must have completions."
        predicted_letter = extract_answer(request_state.result.completions[0].text)

        # The position of the first correct reference determines the letter (0 -> A, 1 -> B, ...).
        correct_index = next(
            (index for index, option in enumerate(request_state.instance.references) if option.is_correct),
            None,
        )
        if correct_index is None:
            raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}")
        correct_letter = chr(ord("A") + correct_index)

        score = 1 if predicted_letter == correct_letter else 0
        return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
@@ -0,0 +1,167 @@
1
+ from typing import Dict, List
2
+ import re
3
+
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
6
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
7
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
8
+ from helm.benchmark.metrics.metric_name import MetricContext, MetricName
9
+ from helm.benchmark.metrics.metric_service import MetricService
10
+ from helm.benchmark.metrics.statistic import Stat, merge_stat
11
+ from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
12
+ from helm.common.hierarchical_logger import hlog
13
+ from helm.common.request import RequestResult, Request, GeneratedOutput
14
+
15
+
16
class GPT4AudioCritiqueMetric(MetricInterface):
    """
    Critique evaluation that asks an evaluator to rate, on a 1-5 scale, how accurately
    the generated text aligns with the ground truth reference.
    """

    # We can add more evaluation aspects here
    METRIC_NAME: str = "gpt4_audio_critique"
    GPT4_AUDIO_CRITIQUE_PROMPT: str = """###Task Description:
A ground truth answer, a response from a model to evaluate, and a score rubric \
representing a evaluation criteria are given.
1. Write a one-sentence feedback that assess the quality of the response \
strictly based on the given score rubric, not evaluating in general.
2. After writing the one-sentence feedback, write a score that is an \
integer between 1 and 5. You should refer to the score rubric.
3. Please do not generate any other opening, closing, and explanations.

###The ground truth answer:
{{ground_truth}}

###Model Response to evaluate:
{{orig_response}}

###Score Rubrics:
[Does the predicted response align with the ground truth \
in terms of accuracy?]

Score 1: The response is completely inaccurate or unrelated to the ground truth.
Score 2: The response contains significant inaccuracies or misinterpretations \
that distort the meaning of the ground truth.
Score 3: The response is mostly accurate but includes minor errors, omissions, \
or ambiguities.
Score 4: The response is accurate and aligns well with the ground truth, with \
only slight room for improvement.
Score 5: The response is fully accurate and precisely matches the ground truth \
with no errors or misinterpretations.

Your response should be in the format:
###Short Explanation: (explanation in only one sentence)
###Rating: (int)
"""

    def __init__(self, num_respondents: int, max_tokens: int):
        self._num_respondents = num_respondents
        self._max_tokens = max_tokens

    def __repr__(self) -> str:
        return "GPT4AudioCritiqueMetric()"

    def _extract_score_from_gpt_output(self, evaluator_response: str):
        """
        Extract the score from the evaluator response. Refer to the official Vibe-Eval implementation:
        https://github.com/reka-ai/reka-vibe-eval/blob/3852d4712da172a7b85dddeffc4f9c3482a6f4c9/evaluate.py#L159-#L164

        Returns the rating as an int between 1 and 5, or None (after logging) when the
        response contains no "Rating: <digit>" text.
        """
        re_match = re.search(r"Rating:\s*([1-5])", evaluator_response)
        if re_match is None:
            hlog(f"Error parsing answer: {evaluator_response}. Skipping question (and so the respondent entirely)")
            return None
        return int(re_match.group(1))

    def evaluate(
        self,
        scenario_state: ScenarioState,
        metric_service: MetricService,
        eval_cache_path: str,
        parallelism: int,
    ) -> MetricResult:
        """
        Evaluate every request state in turn and collect aggregated and per-instance stats.

        `parallelism` is accepted for interface compatibility; request states are
        processed sequentially here.
        """
        request_states: List[RequestState] = scenario_state.request_states

        all_stats: Dict[MetricName, Stat] = {}
        per_instance_stats: List[PerInstanceStats] = []
        for request_state in request_states:
            context = MetricContext.from_instance(request_state.instance)
            stats_without_context = self.evaluate_generation(
                scenario_state.adapter_spec,
                request_state,
                metric_service,
                eval_cache_path,
            )
            stats = [add_context(stat_without_context, context) for stat_without_context in stats_without_context]
            for stat in stats:
                merge_stat(all_stats, stat)
            assert request_state.instance.id is not None
            per_instance_stats.append(
                PerInstanceStats(
                    instance_id=request_state.instance.id,
                    perturbation=request_state.instance.perturbation,
                    train_trial_index=request_state.train_trial_index,
                    stats=stats,
                )
            )
        return MetricResult(aggregated_stats=list(all_stats.values()), per_instance_stats=per_instance_stats)

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """
        Send the first completion and the first reference to the critique service and
        turn the returned ratings into stats.

        Returns an empty list when the critique service has no responses yet. Ratings
        that cannot be parsed are skipped rather than recorded.
        """
        input_request: Request = request_state.request
        assert request_state.result is not None
        request_result: RequestResult = request_state.result
        # Only requests that carry a multimodal prompt are supported.
        assert input_request.multimodal_prompt is not None
        completions: List[GeneratedOutput] = request_result.completions
        # The first completion is judged against the first reference.
        generated_text: str = completions[0].text
        ref_text: str = request_state.instance.references[0].output.text

        template = CritiqueTaskTemplate(
            name="ahelm_gpt4_audio_critique",
            instructions=self.GPT4_AUDIO_CRITIQUE_PROMPT,
            num_respondents=self._num_respondents,
            max_tokens=self._max_tokens,
            questions=[
                CritiqueQuestionTemplate(
                    name=self.METRIC_NAME,
                    question_type=QuestionType.FREE_RESPONSE,
                    text="",
                    options=[],
                )
            ],
        )

        request = CritiqueRequest(
            template=template,
            fields={
                "ground_truth": ref_text,
                "orig_response": generated_text,
            },
        )

        # send to critique request
        result = metric_service.make_critique_request(request)
        if not result or not result.responses:
            # Skip computing metrics if there aren't any responses yet
            hlog("Waiting for responses to be generated.")
            return []

        stats: Dict[str, Stat] = {question.name: Stat(MetricName(question.name)) for question in template.questions}

        for response in result.responses:
            for answer_name, answer in response.answers.items():
                assert isinstance(answer, str)
                answer_value = self._extract_score_from_gpt_output(answer)
                if answer_value is None:
                    # The parse failure was already logged; adding None to the stat
                    # would not be a valid rating, so leave this answer out.
                    continue
                stats[answer_name].add(answer_value)

        return list(stats.values())
@@ -0,0 +1,36 @@
1
+ from typing import Any, Dict, List
2
+
3
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
4
+ from helm.benchmark.adaptation.request_state import RequestState
5
+ from helm.benchmark.metrics.metric import Metric
6
+ from helm.benchmark.metrics.metric_name import MetricName
7
+ from helm.benchmark.metrics.metric_service import MetricService
8
+ from helm.benchmark.metrics.statistic import Stat
9
+
10
+
11
class HelpdeskCallSummarizationMetric(Metric):
    """Averages annotator scores for helpdesk call summarization into one normalized stat."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """Return a single "call_summarization_score" stat for this request.

        Reads the "helpdesk_call_center_summarization" annotations, keeps every
        non-None value whose key ends in "_score", averages them, and rescales
        the mean with (mean - 1) / 9.

        Raises:
            ValueError: If no usable score is present in the annotations.
        """
        assert request_state.annotations
        annotations: Dict[str, Any] = request_state.annotations["helpdesk_call_center_summarization"]

        # A None value means that annotator produced no score; leave it out of the mean.
        scores: List[int] = [
            value for key, value in annotations.items() if key.endswith("_score") and value is not None
        ]
        if not scores:
            raise ValueError(
                "Could not compute score in HelpdeskCallSummarizationMetric because all annotators failed."
            )

        mean_score = sum(scores) / len(scores)
        # Maps 1 -> 0 and 10 -> 1; presumably annotator scores are on a 1-10 scale (confirm against the annotator).
        normalized_score = (mean_score - 1) / 9
        return [Stat(MetricName("call_summarization_score")).add(normalized_score)]
File without changes