crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -9,9 +9,9 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from .metric_name import MetricName
-from .metric_service import MetricService
-from .statistic import Stat
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
 
 
 EFFICIENCY_DATA_PACKAGE: str = "helm.benchmark.efficiency_data"
@@ -0,0 +1,103 @@
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hlog
+
+
+class EhrSqlMetric(Metric):
+    """
+    Metric for evaluating the EHR SQL dataset, focusing on:
+    1. Execution Accuracy – Whether the generated SQL query produces the same results as the ground truth.
+    2. Query Validity – Whether the generated SQL executes without errors.
+    3. Precision for Answerable Questions (Pans).
+    4. Recall for Answerable Questions (Rans).
+    """
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """
+        Evaluate execution accuracy, query validity, and answerability metrics.
+        """
+
+        if not request_state.annotations:
+            hlog(f"Warning: Request state missing annotations for instance {request_state.instance}")
+            return []
+
+        if "ehr_sql" not in request_state.annotations:
+            hlog(f"Warning: 'ehr_sql' key missing in annotations for instance {request_state.instance}")
+            return []
+
+        # Extract execution results
+        predicted_result = request_state.annotations["ehr_sql"].get("predicted_result", [])
+        ground_truth_result = request_state.annotations["ehr_sql"].get("ground_truth_result", [])
+        query_error = request_state.annotations["ehr_sql"].get("query_error", None)
+
+        # Extract predictions from the model output
+        if request_state.result is None:
+            predictions = []
+        else:
+            predictions = [completion.text.strip() for completion in request_state.result.completions]
+        if not predictions:
+            hlog(f"Warning: No predictions found in the completions for instance {request_state.instance}")
+            return []
+
+        # Process the first prediction as the primary output
+        prediction = predictions[0].strip()
+
+        # Extract references and input text
+        references = getattr(request_state.instance, "references", None)
+
+        if not references or len(references) == 0:
+            hlog(f"Warning: Missing references for instance {request_state.instance}")
+            return []
+
+        # Check if the ground truth is answerable based on `is_impossible` flag
+        ground_truth_query = references[0].output.text.strip() if references else None
+        is_impossible = (
+            request_state.instance.extra_data.get("is_impossible", False)
+            if request_state.instance.extra_data
+            else False
+        )
+
+        is_answerable = not is_impossible and bool(ground_truth_query)  # True if the ground truth is answerable
+        is_predicted_answerable = bool(prediction)  # True if the model generated a non-empty SQL query
+        correct_answerable = int(is_answerable and is_predicted_answerable)  # Correct if both are answerable
+
+        # **Execution Accuracy Fix:**
+        execution_accuracy = 0
+
+        if ground_truth_query:
+            if ground_truth_result and predicted_result:
+                execution_accuracy = int(set(predicted_result) == set(ground_truth_result))  # Compare sets.
+            elif not ground_truth_result and not predicted_result and not prediction:
+                execution_accuracy = 1  # Both empty and no query was generated.
+        elif not ground_truth_query and prediction:
+            execution_accuracy = 0  # LLM generated a query when no gold query exists.
+
+        # **Query Validity Fix:**
+        if not prediction:  # No SQL query was generated
+            query_validity = 0
+        elif query_error is None:
+            query_validity = 1  # Query executed successfully.
+        else:
+            query_validity = 0  # Execution error occurred.
+
+        return [
+            # Execution-based Metrics
+            Stat(MetricName("ehr_sql_execution_accuracy")).add(execution_accuracy),
+            Stat(MetricName("ehr_sql_query_validity")).add(query_validity),
+            # Answerability Metrics
+            Stat(MetricName("ehr_sql_precision_answerable")).add(correct_answerable if is_predicted_answerable else 0),
+            Stat(MetricName("ehr_sql_recall_answerable")).add(correct_answerable if is_answerable else 0),
+            Stat(MetricName("ehr_sql_total_predicted_answerable")).add(int(is_predicted_answerable)),
+            Stat(MetricName("ehr_sql_total_ground_truth_answerable")).add(int(is_answerable)),
+        ]
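Aside (illustrative, not taken from the diff): a per-instance metric like the EhrSqlMetric above is typically attached to a run spec through a MetricSpec that names its class; the helper function below is hypothetical.

    from helm.benchmark.metrics.metric import MetricSpec

    def example_ehr_sql_metric_specs() -> list:
        # Hypothetical helper: points HELM at the EhrSqlMetric class added in this release.
        return [MetricSpec(class_name="helm.benchmark.metrics.ehr_sql_metrics.EhrSqlMetric", args={})]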
@@ -6,9 +6,9 @@ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, add_con
 
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
-from .metric_name import MetricName, MetricContext
-from .metric_service import MetricService
-from .statistic import Stat, merge_stat
+from helm.benchmark.metrics.metric_name import MetricName, MetricContext
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
 
 
 class EvaluateInstancesMetric(MetricInterface, ABC):
@@ -1,32 +1,31 @@
 from dataclasses import replace
+from functools import partial
 from typing import Callable, Dict, List, Optional, Set, Tuple, cast
+import re
+import string
+
+from nltk.metrics.scores import f_measure
+from nltk.tokenize import word_tokenize
+from nltk.translate.bleu_score import sentence_bleu
+from rouge_score import rouge_scorer
 import numpy as np
-from functools import partial
+
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics import code_metrics_helper
 from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.nltk_helper import install_nltk_resources
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
+from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from helm.benchmark.scenarios.scenario import Reference
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
-from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
-from nltk.metrics.scores import f_measure
-from nltk.translate.bleu_score import sentence_bleu
-from nltk.tokenize import word_tokenize
-from rouge_score import rouge_scorer
-import re
-import string
-from . import code_metrics_helper
-import nltk
 
 
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")  # Required for rouge
+install_nltk_resources()
 
 
 def pass_at_k_estimator(n: int, c: int, k: int) -> float:
@@ -40,7 +39,7 @@ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
     return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 
 
-def normalize_text(text: str) -> str:
+def normalize_text(text: str, should_remove_articles: bool = True) -> str:
     """Lower text and remove punctuation, articles and extra whitespace.
     Copied from the [QuAC](http://quac.ai/) evaluation script found at
     https://s3.amazonaws.com/my89public/quac/scorer.py"""
@@ -58,7 +57,10 @@ def normalize_text(text: str) -> str:
     def lower(text: str) -> str:
         return text.lower()
 
-    return white_space_fix(remove_articles(remove_punc(lower(text))))
+    normalized_text = remove_punc(lower(text))
+    if should_remove_articles:
+        normalized_text = remove_articles(normalized_text)
+    return white_space_fix(normalized_text)
 
 
 def exact_match(gold: str, pred: str) -> float:
@@ -75,6 +77,17 @@ def quasi_exact_match(gold: str, pred: str) -> float:
     return 1 if normalize_text(gold) == normalize_text(pred) else 0
 
 
+def quasi_leave_articles_exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    return (
+        1
+        if normalize_text(gold, should_remove_articles=False) == normalize_text(pred, should_remove_articles=False)
+        else 0
+    )
+
+
 def prefix_exact_match(gold: str, pred: str) -> float:
     """
     The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
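Aside (illustrative, not taken from the diff): the new should_remove_articles flag is what separates quasi_exact_match from quasi_leave_articles_exact_match; a quick sketch using the functions defined above.

    normalize_text("The Capital.")                                 # -> "capital"
    normalize_text("The Capital.", should_remove_articles=False)   # -> "the capital"
    quasi_exact_match("the capital", "Capital")                    # -> 1 (articles stripped before comparing)
    quasi_leave_articles_exact_match("the capital", "Capital")     # -> 0 (articles kept, so the strings differ)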
@@ -203,6 +216,112 @@ def cider(gold: str, pred: str) -> float:
     return average_score
 
 
+def wer_score(gold: str, pred: str) -> float:
+    # Word Error Rate (WER), which is a common
+    # metric used to evaluate the accuracy of speech recognition systems.
+    # The lower the better. The WER might be greater than 1.
+    # https://huggingface.co/learn/audio-course/en/chapter5/evaluation#word-error-rate
+    try:
+        from jiwer import wer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    wer_ret = wer(gold, pred)
+    return wer_ret
+
+
+def mer_score(gold: str, pred: str) -> float:
+    # Match Error Rate (MER), which is for evaluating the error rate of
+    # speech recognition systems. The lower the better.
+    try:
+        from jiwer import mer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    mer_ret = mer(gold, pred)
+    return mer_ret
+
+
+def wip_score(gold: str, pred: str) -> float:
+    # Word information preservation (WIP) for evaluating the preserved information of speech
+    # recognition systems. The higher the better.
+    try:
+        from jiwer import wip
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    wip_ret = wip(gold, pred)
+    return wip_ret
+
+
+def cer_score(gold: str, pred: str) -> float:
+    # Character Error Rate (CER) for evaluating the accuracy
+    # of speech recognition systems. The lower the better.
+    try:
+        from jiwer import cer
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    if not pred:
+        return 0
+
+    gold = normalize_text(gold, should_remove_articles=False)
+    pred = normalize_text(pred, should_remove_articles=False)
+    cer_ret = cer(gold, pred)
+    assert isinstance(cer_ret, float)
+    return cer_ret
+
+
+def chinese_wer_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return wer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
+def chinese_mer_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return mer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
+def chinese_wip_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return wip_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
+def chinese_cer_score(gold: str, pred: str) -> float:
+    try:
+        import jieba
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["audiolm"])
+
+    return cer_score(" ".join(jieba.cut(gold)), " ".join(jieba.cut(pred)))
+
+
 def extract_set_from_text(
     set_str: str,
     set_start_str: str = " is ",
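Aside (illustrative, not taken from the diff): the speech metrics above defer to the jiwer package, which can be exercised directly; the strings below are invented examples.

    from jiwer import cer, wer

    reference = "the quick brown fox"
    hypothesis = "the quick brown dog"
    print(wer(reference, hypothesis))  # 0.25: one of the four reference words is substituted
    print(cer(reference, hypothesis))  # character error rate; lower is better for both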
@@ -329,6 +448,7 @@ def compute_reference_metrics(
     metric_fn_mapping: Dict[str, Callable] = {
         "exact_match": exact_match,
         "quasi_exact_match": quasi_exact_match,
+        "quasi_leave_articles_exact_match": quasi_leave_articles_exact_match,
         "prefix_exact_match": prefix_exact_match,
         "quasi_prefix_exact_match": quasi_prefix_exact_match,
         "exact_match_indicator": exact_match_indicator,
@@ -352,6 +472,14 @@
         "chinese_rouge_2": get_chinese_rouge_function("rouge2"),
         "cleva_math_result_match": cleva_math_result_match,
         "absolute_value_difference": absolute_value_difference,
+        "wer_score": wer_score,
+        "mer_score": mer_score,
+        "wip_score": wip_score,
+        "cer_score": cer_score,
+        "chinese_wer_score": chinese_wer_score,
+        "chinese_mer_score": chinese_mer_score,
+        "chinese_wip_score": chinese_wip_score,
+        "chinese_cer_score": chinese_cer_score,
     }
 
     stats: List[Stat] = []
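Aside (illustrative, not taken from the diff): scenarios select these reference metrics by the names registered in metric_fn_mapping; a hedged sketch, assuming the usual get_basic_metric_specs helper.

    from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs

    # Each name must match a key in metric_fn_mapping above.
    metric_specs = get_basic_metric_specs(["wer_score", "mer_score", "wip_score", "cer_score"])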
@@ -0,0 +1,103 @@
1
+ import re
2
+ from typing import List, Optional
3
+
4
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.benchmark.metrics.metric import Metric
7
+ from helm.benchmark.metrics.metric_name import MetricName
8
+ from helm.benchmark.metrics.metric_service import MetricService
9
+ from helm.benchmark.metrics.statistic import Stat
10
+
11
+
12
+ def extract_answer(output_text: str) -> Optional[str]:
13
+ """
14
+ Extracts the answer from the output text using two exact regex patterns.
15
+ Returns None if no valid answer is found.
16
+
17
+ Args:
18
+ output_text (str): The text from which to extract the answer.
19
+
20
+ Returns:
21
+ Optional[str]: The extracted answer (A-J) if found, otherwise None.
22
+ """
23
+ # First regex: Matches "answer is (A-J)" with optional parentheses
24
+ match = re.search(r"answer is \(?([A-J])\)?", output_text)
25
+ if match:
26
+ return match.group(1)
27
+
28
+ # Second regex: Matches "[answer: (A-J)]" with optional leading characters like "."
29
+ match = re.search(r"\.*\[aA\]nswer:\s*\(?([A-J])\)?", output_text)
30
+ if match:
31
+ return match.group(1)
32
+
33
+ # Third regex: Matches "answer is (A-J)" with optional leading non-alpha characters
34
+ match = re.search(r"correct answer is [^A-Za-z]*([A-J])", output_text)
35
+ if match:
36
+ return match.group(1)
37
+
38
+ # Fourth regex: Matches "answer is (A-J)" with optional leading non-capital alpha characters
39
+ match = re.search(r"correct answer is [^A-Z]*([A-J])", output_text)
40
+ if match:
41
+ return match.group(1)
42
+
43
+ # If no regex matches, return None
44
+ return None
45
+
46
+
+ class GPQAChainOfThoughtMetric(Metric):
+     """
+     This metric focuses on structured reasoning and the accuracy of extracted answers.
+     It compares model outputs against correct answers provided in a multiple-choice
+     format and returns a score indicating the correctness of the generated response.
+     """
+ 
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         """
+         Evaluate the generated output for chain-of-thought reasoning accuracy.
+ 
+         The method extracts the model's answer from the output, determines the correct answer
+         from the provided references, and compares the two to compute a binary score.
+ 
+         Args:
+             adapter_spec (AdapterSpec): Specification of the adapter used for the evaluation.
+             request_state (RequestState): The state of the current request, including
+                 the input instance, output results, and references.
+             metric_service (MetricService): A service used to compute metrics if needed.
+             eval_cache_path (str): Path to the evaluation cache for storing or retrieving data.
+ 
+         Returns:
+             List[Stat]: A list containing a single `Stat` object with the correctness
+                 score (1 for correct, 0 for incorrect) under the metric
+                 name "chain_of_thought_correctness".
+         """
+         # Assert that completions exist if the result is not None
+         assert (
+             request_state.result is not None and request_state.result.completions
+         ), "Request state result must have completions."
+ 
+         # Set output_text if the assertion passes
+         output_text = request_state.result.completions[0].text
+ 
+         # Extract the answer using the regex patterns above
+         extracted_answer = extract_answer(output_text)
+ 
+         # Find the correct answer from references by translating index to letter
+         correct_answer = None
+         for index, option in enumerate(request_state.instance.references):
+             if option.is_correct:
+                 correct_answer = chr(65 + index)  # Translate index (0 -> A, 1 -> B, etc.)
+                 break
+ 
+         # Raise an exception if no correct answer is found
+         if correct_answer is None:
+             raise ValueError(f"No correct answer found for instance ID {request_state.instance.id}")
+ 
+         # Compare extracted answer with the correct answer and compute the score
+         score = 1 if extracted_answer == correct_answer else 0
+         return [Stat(MetricName("chain_of_thought_correctness")).add(score)]
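For reference only, not part of the diff: the extraction patterns above behave roughly as follows on typical chain-of-thought completions. The example strings are invented.

extract_answer("Thus, the answer is (C).")        # -> "C"  (first pattern)
extract_answer("The correct answer is: D")        # -> "D"  (third pattern)
extract_answer("I am not sure about this one.")   # -> None (no pattern matches)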
@@ -0,0 +1,167 @@
+ from typing import Dict, List, Optional
+ import re
+ 
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.adaptation.scenario_state import ScenarioState
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+ from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat, merge_stat
+ from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
+ from helm.common.hierarchical_logger import hlog
+ from helm.common.request import RequestResult, Request, GeneratedOutput
+ 
+ 
+ class GPT4AudioCritiqueMetric(MetricInterface):
+     """
+     Critique evaluation that asks GPT-4 to rate how accurately the generated text
+     matches the ground-truth answer for audio scenarios.
+     """
+ 
+     # We can add more evaluation aspects here
+     METRIC_NAME: str = "gpt4_audio_critique"
+     GPT4_AUDIO_CRITIQUE_PROMPT: str = """###Task Description:
+ A ground truth answer, a response from a model to evaluate, and a score rubric \
+ representing a evaluation criteria are given.
+ 1. Write a one-sentence feedback that assess the quality of the response \
+ strictly based on the given score rubric, not evaluating in general.
+ 2. After writing the one-sentence feedback, write a score that is an \
+ integer between 1 and 5. You should refer to the score rubric.
+ 3. Please do not generate any other opening, closing, and explanations.
+ 
+ ###The ground truth answer:
+ {{ground_truth}}
+ 
+ ###Model Response to evaluate:
+ {{orig_response}}
+ 
+ ###Score Rubrics:
+ [Does the predicted response align with the ground truth \
+ in terms of accuracy?]
+ 
+ Score 1: The response is completely inaccurate or unrelated to the ground truth.
+ Score 2: The response contains significant inaccuracies or misinterpretations \
+ that distort the meaning of the ground truth.
+ Score 3: The response is mostly accurate but includes minor errors, omissions, \
+ or ambiguities.
+ Score 4: The response is accurate and aligns well with the ground truth, with \
+ only slight room for improvement.
+ Score 5: The response is fully accurate and precisely matches the ground truth \
+ with no errors or misinterpretations.
+ 
+ Your response should be in the format:
+ ###Short Explanation: (explanation in only one sentence)
+ ###Rating: (int)
+ """
+ 
+     def __init__(self, num_respondents: int, max_tokens: int):
+         self._num_respondents = num_respondents
+         self._max_tokens = max_tokens
+ 
+     def __repr__(self) -> str:
+         return "GPT4AudioCritiqueMetric()"
+ 
+     def _extract_score_from_gpt_output(self, evaluator_response: str) -> Optional[int]:
+         """
+         Extract the score from the evaluator response. Refer to the official Vibe-Eval implementation:
+         https://github.com/reka-ai/reka-vibe-eval/blob/3852d4712da172a7b85dddeffc4f9c3482a6f4c9/evaluate.py#L159-#L164
+         """
+         re_match = re.search(r"Rating:\s*([1-5])", evaluator_response)
+         if re_match is None:
+             hlog(f"Error parsing answer: {evaluator_response}. Skipping question (and so the respondent entirely)")
+             return None
+         return int(re_match.group(1))
+ 
+     def evaluate(
+         self,
+         scenario_state: ScenarioState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+         parallelism: int,
+     ) -> MetricResult:
+         request_states: List[RequestState] = scenario_state.request_states
+ 
+         all_stats: Dict[MetricName, Stat] = {}
+         per_instance_stats: List[PerInstanceStats] = []
+         for request_state in request_states:
+             context = MetricContext.from_instance(request_state.instance)
+             stats_without_context = self.evaluate_generation(
+                 scenario_state.adapter_spec,
+                 request_state,
+                 metric_service,
+                 eval_cache_path,
+             )
+             stats = [add_context(stat_without_context, context) for stat_without_context in stats_without_context]
+             for stat in stats:
+                 merge_stat(all_stats, stat)
+             assert request_state.instance.id is not None
+             per_instance_stats.append(
+                 PerInstanceStats(
+                     instance_id=request_state.instance.id,
+                     perturbation=request_state.instance.perturbation,
+                     train_trial_index=request_state.train_trial_index,
+                     stats=stats,
+                 )
+             )
+         return MetricResult(aggregated_stats=list(all_stats.values()), per_instance_stats=per_instance_stats)
+ 
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         input_request: Request = request_state.request
+         # The generated response is judged against the reference (ground-truth) answer
+         assert request_state.result is not None
+         request_result: RequestResult = request_state.result
+         # The request must carry the multimodal (audio) prompt that produced the response
+         assert input_request.multimodal_prompt is not None
+         completions: List[GeneratedOutput] = request_result.completions
+         generated_text: str = completions[0].text
+         ref_text: str = request_state.instance.references[0].output.text
+ 
+         template = CritiqueTaskTemplate(
+             name="ahelm_gpt4_audio_critique",
+             instructions=self.GPT4_AUDIO_CRITIQUE_PROMPT,
+             num_respondents=self._num_respondents,
+             max_tokens=self._max_tokens,
+             questions=[
+                 CritiqueQuestionTemplate(
+                     name=self.METRIC_NAME,
+                     question_type=QuestionType.FREE_RESPONSE,
+                     text="",
+                     options=[],
+                     # media_object=image_objects[0], # we only take the first image as input
+                 )
+             ],
+         )
+ 
+         request = CritiqueRequest(
+             template=template,
+             fields={
+                 "ground_truth": ref_text,
+                 "orig_response": generated_text,
+             },
+         )
+ 
+         # Send the critique request to the evaluator
+         result = metric_service.make_critique_request(request)
+         if not result or not result.responses:
+             # Skip computing metrics if there aren't any responses yet
+             hlog("Waiting for responses to be generated.")
+             return []
+ 
+         stats: Dict[str, Stat] = {}
+         for question in template.questions:
+             stats[question.name] = Stat(MetricName(question.name))
+ 
+         for response in result.responses:
+             for answer_name, answer in response.answers.items():
+                 assert isinstance(answer, str)
+                 answer_value = self._extract_score_from_gpt_output(answer)
+                 if answer_value is None:
+                     # The evaluator reply could not be parsed into a 1-5 rating; skip it.
+                     continue
+                 stats[answer_name].add(answer_value)
+ 
+         return list(stats.values())
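For illustration only, not part of the diff: the rating extraction expects evaluator replies in the format requested by the prompt. The sample reply below is invented.

sample_reply = "###Short Explanation: The response matches the ground truth closely.\n###Rating: 4"
re.search(r"Rating:\s*([1-5])", sample_reply).group(1)  # -> "4", parsed to the integer score 4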
@@ -0,0 +1,36 @@
+ from typing import Any, Dict, List
+ 
+ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+ from helm.benchmark.adaptation.request_state import RequestState
+ from helm.benchmark.metrics.metric import Metric
+ from helm.benchmark.metrics.metric_name import MetricName
+ from helm.benchmark.metrics.metric_service import MetricService
+ from helm.benchmark.metrics.statistic import Stat
+ 
+ 
+ class HelpdeskCallSummarizationMetric(Metric):
+     """Score metrics for helpdesk call summarization."""
+ 
+     def evaluate_generation(
+         self,
+         adapter_spec: AdapterSpec,
+         request_state: RequestState,
+         metric_service: MetricService,
+         eval_cache_path: str,
+     ) -> List[Stat]:
+         assert request_state.annotations
+         annotations: Dict[str, Any] = request_state.annotations["helpdesk_call_center_summarization"]
+         scores: List[int] = []
+         for annotation_key, annotation_value in annotations.items():
+             if annotation_key.endswith("_score") and annotation_value is not None:
+                 scores.append(annotation_value)
+         if not scores:
+             raise ValueError(
+                 "Could not compute score in HelpdeskCallSummarizationMetric because all annotators failed."
+             )
+         score = sum(scores) / len(scores)
+         # Normalize the mean annotator score from the 1-10 scale to the [0, 1] range
+         score = (score - 1) / 9
+         return [
+             Stat(MetricName("call_summarization_score")).add(score),
+         ]