crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic; see the package registry's advisory page for more details.

Files changed (652)
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,93 @@
1
+ from typing import Dict, Optional, Set
2
+
3
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
4
+ from helm.clients.auto_client import AutoClient
5
+
6
+
7
# Prompt given to each judge on the LLM jury.  {{QUESTION}} and {{RESPONSE}}
# are substituted by the annotator at request time.
#
# NOTE: this must be a *raw* string: the JSON-formatting instructions include
# a literal escaped double quote (\") as an example, and in a normal string
# literal Python would collapse that `\"` escape into a plain `"`, so the
# judge would never see the intended example.
PROMPT_TEMPLATE = r"""You are a medical expert tasked with evaluating the quality of a
generated clinical note given unstructured clinical text.
Your goal is to assess how well the generated response captures the necessary information
and follows provided instructions in terms of accuracy, structure, and clarity.

The user's request will be provided in these tags:
<user_request>
{{QUESTION}}
</user_request>

The response will be provided in these tags:
<response>
{{RESPONSE}}
</response>

Carefully analyze the <response>.
For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
and provide a short justification for your score.

Evaluation Criteria:
Accuracy (1-5)
- Is all the information in the response factually correct?

Structure (1-5)
- Does the response contain all parts for the provided note generation structure?

Clarity (1-5)
- Is the response easy to understand for a clinician?

Output Format:
Output the evaluation as a single valid JSON object matching the following structure:
{
  "accuracy": {
    "score": 0,
    "explanation": "Explain why this score was given."
  },
  "structure": {
    "score": 0,
    "explanation": "Explain why this score was given."
  },
  "clarity": {
    "score": 0,
    "explanation": "Explain why this score was given."
  }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
maintain valid JSON formatting.
- Do not include any additional information in the output.
"""
59
+
60
# Every criterion the jury returns must carry both a numeric score and a
# free-text explanation; the jury annotator validates judge output against this.
ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    criterion: {"score", "explanation"} for criterion in ("accuracy", "structure", "clarity")
}
65
+
66
# The three-judge jury.  Every judge is reached through the Stanford
# Healthcare deployment of the corresponding model, so the deployment name is
# derived from the model name's suffix.
ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
    short_name: AnnotatorModelInfo(
        model_name=model_name,
        model_deployment="stanfordhealthcare/" + model_name.split("/", 1)[1],
    )
    for short_name, model_name in (
        ("gpt", "openai/gpt-4o-2024-05-13"),
        ("llama", "meta/llama-3.3-70b-instruct"),
        ("claude", "anthropic/claude-3-7-sonnet-20250219"),
    )
}
80
+
81
+
82
class CHWCarePlanAnnotator(LLMAsJuryAnnotator):
    """The CHWCarePlan autograder.

    Grades generated clinical notes with an LLM jury (ANNOTATOR_MODELS)
    against the criteria declared in ANNOTATION_CRITERIA, using
    PROMPT_TEMPLATE as the judge prompt.
    """

    name = "chw_care_plan"

    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
        # NOTE(review): `template_name` is accepted but never used nor
        # forwarded to the base class -- confirm whether it should select an
        # alternative prompt template or be removed.
        super().__init__(
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=ANNOTATOR_MODELS,
        )
@@ -0,0 +1,78 @@
1
+ import os
2
+ import sqlite3
3
+ import threading
4
+ from typing import Any, Optional, Tuple
5
+
6
+ from helm.benchmark.adaptation.request_state import RequestState
7
+ from helm.benchmark.annotation.annotator import Annotator
8
+ from helm.common.general import ensure_directory_exists, ensure_file_downloaded
9
+
10
+
11
class CzechBankQAAnnotator(Annotator):
    """The CzechBankQA autograder.

    Runs both the model-generated SQL query and the gold SQL query read-only
    against a downloaded copy of the Czech bank SQLite database and records
    both results (or errors) so a downstream metric can compare them.

    A single SQLite connection is shared across worker threads; all access is
    serialized through ``self.lock``.  (Previously the connection was created
    with the default ``check_same_thread=True`` and the lock was never used,
    which is why this annotator used to require ``--num-threads 1``.)
    """

    name = "czech_bank_qa"

    DATABASE_SOURCE_URL = (
        "https://huggingface.co/datasets/yifanmai/czech_bank_qa/resolve/main/czech_bank.db?download=true"
    )

    def __init__(self, file_storage_path: str):
        super().__init__()

        cache_dir = os.path.join(file_storage_path, "data")
        ensure_directory_exists(cache_dir)
        file_name = "czech_bank.db"
        file_path = os.path.join(cache_dir, file_name)
        ensure_file_downloaded(source_url=CzechBankQAAnnotator.DATABASE_SOURCE_URL, target_path=file_path)
        # check_same_thread=False lets the shared connection be used from any
        # worker thread; correctness is guaranteed by serializing all access
        # through self.lock in get_result().
        self.database = sqlite3.connect(file_path, check_same_thread=False)
        self.lock = threading.Lock()

    def get_result(self, query: str) -> Tuple[Optional[str], Optional[str]]:
        """Execute `query` read-only and return (stringified rows, error).

        On success the error element is None; on failure the result element
        is None and the error holds the SQLite error message.
        """
        result: Optional[str] = None
        error: Optional[str] = None
        # Serialize access to the shared connection across worker threads.
        with self.lock:
            try:
                cursor = self.database.cursor()
                # Guard against the model's query mutating the database.
                cursor.execute("PRAGMA query_only = TRUE")
                cursor.execute(query)
                result = str(cursor.fetchall())
                cursor.close()
            except (sqlite3.DatabaseError, sqlite3.Warning) as e:
                error = str(e)
        return (result, error)

    def annotate(self, request_state: RequestState) -> Any:
        """Execute the generated and gold queries; return both outcomes."""
        assert request_state.result
        assert len(request_state.result.completions) == 1

        assert len(request_state.instance.references) == 1
        gold_query = request_state.instance.references[0].output.text
        query = request_state.result.completions[0].text
        # Strip Markdown code fences the model may wrap around its SQL.
        query = query.replace("```sql", "").replace("```", "")
        result, error = self.get_result(query)
        gold_result, gold_error = self.get_result(gold_query)

        return {"query": query, "result": result, "error": error, "gold_result": gold_result, "gold_error": gold_error}
@@ -0,0 +1,107 @@
1
+ from typing import Dict, Optional, Set
2
+
3
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
4
+ from helm.clients.auto_client import AutoClient
5
+
6
+
7
# Prompt given to each judge on the LLM jury.  {{QUESTION}}, {{RESPONSE}} and
# {{GOLD_RESPONSE}} are substituted by the annotator at request time.
#
# NOTE: this must be a *raw* string: the JSON-formatting instructions include
# a literal escaped double quote (\") as an example, and in a normal string
# literal Python would collapse that `\"` escape into a plain `"`, so the
# judge would never see the intended example.
PROMPT_TEMPLATE = r"""You are a medical expert responsible for evaluating a hospital document.
The task requires generating either discharge instructions or a brief hospital course based
on the provided discharge summary and radiology report.

Your goal is to assess whether the generated text is clinically accurate, complete, and clear
for the intended document type. The evaluation should ensure the document aligns with the
gold response in terms of accuracy, completeness, and clarity.

The target task of either generating a discharge instruction or brief hospital course along with
the patient discharge text and radiology report will be provided in these tags:
<patient_information>
{{QUESTION}}
</patient_information>


The document will be provided in these tags:
<response>
{{RESPONSE}}
</response>

The gold standard target document (either discharge instructions or a brief hospital course)
will be provided in these tags:
<gold_response>
{{GOLD_RESPONSE}}
</gold_response>

Carefully analyze the <response> based on the <patient_information> and compare
it to the <gold_response> when necessary.

For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent)
and provide a brief justification for the score.

Evaluation Criteria:
Accuracy (1-5)
- Does the document provide correct medical information based on the patient's condition and source materials?

Completeness (1-5)
- Does the document include all important information needed for the specific document type?

Clarity (1-5)
- Is the document easy to understand for the right audience — patients for discharge
instructions or clinicians for the hospital course?

Output Format:
Generate a valid JSON object with the following structure:
{
  "accuracy": {
    "score": 0,
    "explanation": "Explain why this score was given."
  },
  "completeness": {
    "score": 0,
    "explanation": "Explain why this score was given."
  },
  "clarity": {
    "score": 0,
    "explanation": "Explain why this score was given."
  }
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
maintain valid JSON formatting.
- Do not include any additional information in the output.
"""
73
+
74
# Fields required for each criterion in the judge's JSON output; the jury
# annotator validates judge responses against this mapping.
ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    criterion: {"score", "explanation"} for criterion in ("accuracy", "completeness", "clarity")
}
79
+
80
# The three-judge jury.  Every judge is reached through the Stanford
# Healthcare deployment of the corresponding model, so the deployment name is
# derived from the model name's suffix.
ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
    short_name: AnnotatorModelInfo(
        model_name=model_name,
        model_deployment="stanfordhealthcare/" + model_name.split("/", 1)[1],
    )
    for short_name, model_name in (
        ("gpt", "openai/gpt-4o-2024-05-13"),
        ("llama", "meta/llama-3.3-70b-instruct"),
        ("claude", "anthropic/claude-3-7-sonnet-20250219"),
    )
}
94
+
95
+
96
class DischargeMeAnnotator(LLMAsJuryAnnotator):
    """The DischargeMe autograder.

    Grades generated discharge instructions or brief hospital courses with an
    LLM jury (ANNOTATOR_MODELS) against the criteria in ANNOTATION_CRITERIA,
    using PROMPT_TEMPLATE as the judge prompt.
    """

    name = "dischargeme"

    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
        # NOTE(review): `template_name` is accepted but never used nor
        # forwarded to the base class -- confirm whether it is still needed.
        super().__init__(
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=ANNOTATOR_MODELS,
        )
@@ -0,0 +1,87 @@
1
+ from typing import Any, List, Optional
2
+ import os
3
+ import re
4
+ import sqlite3
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.benchmark.annotation.annotator import Annotator
7
+ from helm.common.hierarchical_logger import hwarn
8
+ from helm.benchmark.runner import get_benchmark_output_path
9
+
10
+
11
class EhrSqlAnnotator(Annotator):
    """
    Executes both ground truth and generated SQL queries on the eicu.sqlite database.
    """

    name = "ehr_sql"

    def annotate(self, request_state: RequestState) -> Any:
        """Evaluate SQL execution accuracy by running queries against the eicu.sqlite database."""

        # The database is expected to exist under the benchmark output path;
        # presumably placed there by the EHR-SQL scenario -- TODO confirm.
        databases_root_path = os.path.join(get_benchmark_output_path(), "scenarios", "ehr_sql")
        database_path = os.path.join(databases_root_path, "eicu.sqlite")

        assert len(request_state.instance.references) == 1
        ground_truth_sql = request_state.instance.references[0].output.text.strip()
        # cursor.fetchall() returns a list of row tuples (and the fallback
        # below may assign a list of scalar values), so the element type is Any.
        ground_truth_result: List[Any] = []

        # Execute the ground truth query
        try:
            with sqlite3.connect(database_path) as conn:
                cursor = conn.cursor()
                cursor.execute(ground_truth_sql)
                ground_truth_result = cursor.fetchall()
        except (sqlite3.OperationalError, sqlite3.Warning) as e:
            hwarn(f"Ground truth SQL failed with error: {e}")

        # If ground truth SQL execution didn't return results, attempt to use extra_data["value"]
        if not ground_truth_result and request_state.instance.extra_data is not None:
            if "value" in request_state.instance.extra_data:
                extra_values = list(request_state.instance.extra_data["value"].values())

                # Try inferring types from the database schema if possible
                # NOTE(review): this re-executes the same ground-truth SQL that
                # just failed or returned no rows, so fetchone() is normally
                # None and `extra_values` ends up used unconverted -- confirm
                # this double execution is intended.
                with sqlite3.connect(database_path) as conn:
                    cursor = conn.cursor()
                    try:
                        cursor.execute(ground_truth_sql)
                        fetched_result = cursor.fetchone()
                        if fetched_result:
                            # Convert extra_values to match SQLite's expected types
                            converted_values = [
                                type(fetched_result[i])(extra_values[i]) for i in range(len(extra_values))
                            ]
                            ground_truth_result = converted_values
                        else:
                            # If no rows were fetched, use `extra_values` as-is
                            ground_truth_result = extra_values
                    except sqlite3.OperationalError:
                        # If query fails (syntax error, etc.), just use `extra_values` as-is
                        ground_truth_result = extra_values

        assert request_state.result is not None
        assert len(request_state.result.completions) == 1
        predicted_text = request_state.result.completions[0].text.strip()

        # Extract SQL from <sql>...</sql> tags if present; the pattern is
        # deliberately fuzzy (casing, optional "/" in the closing tag).
        predicted_sql_match = re.search(r"<\s*sql\s*>(.*?)<\/?\s*sql\s*>", predicted_text, re.DOTALL | re.IGNORECASE)
        predicted_sql = predicted_sql_match.group(1).strip() if predicted_sql_match else predicted_text.strip()

        predicted_result: List[Any] = []
        query_error: Optional[str] = None
        # Strip Markdown backticks and a leading "sql" language marker left
        # over from fenced code blocks.
        predicted_sql = predicted_sql.replace("`", "").strip()
        predicted_sql = re.sub(r"^sql\n", "", predicted_sql, flags=re.MULTILINE)
        if not predicted_sql:
            query_error = "No query generated"
        else:
            try:
                with sqlite3.connect(database_path) as conn:
                    cursor = conn.cursor()
                    cursor.execute(predicted_sql)
                    predicted_result = cursor.fetchall()
            except (sqlite3.OperationalError, sqlite3.Warning) as e:
                query_error = str(e)

        return {
            "predicted_result": predicted_result,
            "ground_truth_result": ground_truth_result,
            "query_error": query_error,
        }
@@ -0,0 +1,131 @@
1
+ import re
2
+ from typing import Any, Dict, Optional, Union
3
+
4
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
5
+ from helm.benchmark.adaptation.request_state import RequestState
6
+ from helm.benchmark.annotation.annotator import Annotator
7
+ from helm.clients.auto_client import AutoClient
8
+ from helm.common.hierarchical_logger import hwarn
9
+ from helm.common.request import Request
10
+ from helm.proxy.retry import NonRetriableException
11
+
12
+
13
class AnnotatorResponseParseFailure(NonRetriableException):
    """Raised when a judge's response cannot be parsed.

    NOTE(review): this exception is defined but never raised in this module,
    and ``super().__init__(kwargs)`` passes the kwargs *dict* as the single
    positional exception argument -- confirm whether ``**kwargs`` (or a plain
    message string) was intended.
    """

    def __init__(self, response_text: str, **kwargs):
        # Raw judge output that failed to parse, kept for debugging.
        self.response_text = response_text
        super().__init__(kwargs)
17
+
18
+
19
class HelpdeskCallSummarizationAnnotator(Annotator):
    """The Helpdesk Call Summarization autograder.

    Sends the call transcript and the model-generated summary to a panel of
    three judge models and records, per judge, the free-text reasoning and a
    1-10 numeric score under ``<judge>_reasoning`` / ``<judge>_score``.
    """

    name = "helpdesk_call_center_summarization"

    SHORT_NAME_TO_MODEL_INFO: Dict[str, AnnotatorModelInfo] = {
        "gpt": AnnotatorModelInfo(
            model_name="openai/gpt-4o-2024-05-13",
            model_deployment="openai/gpt-4o-2024-05-13",
        ),
        "llama": AnnotatorModelInfo(
            model_name="meta/llama-3.1-405b-instruct-turbo",
            model_deployment="together/llama-3.1-405b-instruct-turbo",
        ),
        "claude": AnnotatorModelInfo(
            model_name="anthropic/claude-3-5-sonnet-20241022",
            model_deployment="anthropic/claude-3-5-sonnet-20241022",
        ),
    }

    # Template loosely based on these references:
    # - https://ieeexplore.ieee.org/abstract/document/9946852
    # - https://arxiv.org/abs/2409.02413v1
    PROMPT_TEMPLATE = """You are an expert evaluator. Your task is to evaluate the quality of a model-generated summary of a helpdesk call transcript.

The helpdesk call transcript and summary are provided below, delineated with start and end tags:

<call_transcript>
{{QUESTION}}
</call_transcript>
<summary>
{{PRED}}
</summary>

Evaluate the summary based on the following criteria:

- Conciseness: A high-quality summary should effectively convey the most important information from the call transcript while keeping the length brief.
- Relevance: The information presented in the summary should be relevant to the main topic.
- Coherence: A good summary should have a clear structure and flow of ideas that make it easy to understand and follow.
- Accuracy: The summary's information should be factually correct and should not contain false or misleading information.
- Completeness: The summary should a fair description of the main problems (concerns) and resolutions based on the original call transcripts.

Think step by step, then score the summary. Your reasoning should be less than 200 words. The score should be a single number between 1 to 10 inclusive.

Please respond with your output and reasoning in the following format, your reasoning within <reasoning></reasoning> tags and your score within <score></score> tags, without any other output:

<reasoning>INSERT_YOUR_REASONING_HERE</reasoning>
<score>INSERT_YOUR_SCORE_HERE</score>
"""  # noqa: E501

    # NOTE(review): PATTERN appears to be dead code -- the tag-based regexes
    # below are what actually parse judge output. Confirm before deleting.
    PATTERN = r"^\s*reason:(.*)##(.*)"

    # Fuzzy tag matchers: tolerate different casing and a judge forgetting
    # the "/" in the closing tag. Compiled once at class definition time
    # instead of on every judge call.
    _REASONING_RE = re.compile(r"<\s*reasoning\s*>(.*?)<\/?\s*reasoning\s*>", re.DOTALL | re.IGNORECASE)
    _SCORE_RE = re.compile(r"<\s*score\s*>(.*?)<\/?\s*score\s*>", re.DOTALL | re.IGNORECASE)

    def __init__(self, auto_client: AutoClient):
        self._auto_client = auto_client

    def annotate(self, request_state: RequestState) -> Any:
        """Query every judge and collect reasoning/score annotations.

        Raises:
            Exception: if any judge request fails.
        """
        assert request_state.result
        assert len(request_state.result.completions) == 1
        prediction_text = request_state.result.completions[0].text

        question_text = request_state.instance.input.text

        annotator_prompt = self.PROMPT_TEMPLATE.replace("{{QUESTION}}", question_text).replace(
            "{{PRED}}", prediction_text
        )
        annotations: Dict[str, Union[Optional[str], Optional[float]]] = {"prompt_text": annotator_prompt}
        for annotator_name, annotator_model_info in self.SHORT_NAME_TO_MODEL_INFO.items():
            annotator_request = Request(
                model=annotator_model_info.model_name,
                model_deployment=annotator_model_info.model_deployment,
                prompt=annotator_prompt,
                temperature=0.0,
                max_tokens=512,
            )
            annotator_response = self._auto_client.make_request(annotator_request)
            if not annotator_response.success:
                raise Exception(f"Annotation request failed: {annotator_response.error}")
            assert len(annotator_response.completions) == 1
            annotator_response_text = annotator_response.completions[0].text

            reasoning_match = self._REASONING_RE.search(annotator_response_text)
            score_match = self._SCORE_RE.search(annotator_response_text)
            reasoning: Optional[str] = None
            score: Optional[float] = None
            if reasoning_match:
                reasoning = reasoning_match.group(1).strip()
            else:
                hwarn(
                    "HelpdeskCallSummarizationAnnotator could not get Reasoning from annotation from "
                    f"{annotator_model_info.model_name}: {annotator_response_text}"
                )

            if score_match:
                try:
                    score = float(score_match.group(1).strip())
                except ValueError:
                    hwarn(
                        "HelpdeskCallSummarizationAnnotator could not parse Score from annotation from "
                        f"{annotator_model_info.model_name}: {annotator_response_text}"
                    )
            else:
                hwarn(
                    "HelpdeskCallSummarizationAnnotator could not get Score from annotation from "
                    f"{annotator_model_info.model_name}: {annotator_response_text}"
                )

            # Missing pieces are recorded as None so downstream metrics can
            # distinguish "judge answered badly" from "judge not queried".
            annotations[f"{annotator_name}_reasoning"] = reasoning
            annotations[f"{annotator_name}_score"] = score
        return annotations
@@ -1,4 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
+ from threading import Lock
2
3
  from typing import Any, Dict, List, Tuple, Callable
3
4
 
4
5
  from dacite import from_dict
@@ -17,6 +18,9 @@ except ModuleNotFoundError as e:
17
18
  handle_module_not_found_error(e, suggestions=["images"])
18
19
 
19
20
 
21
+ _compilation_lock = Lock()
22
+
23
+
20
24
  def retry_if_compilation_failed(result: Dict[str, Any]) -> bool:
21
25
  """Retries when the compilation fails."""
22
26
  return "unknown_error" in result
@@ -78,7 +82,8 @@ class ImageCompilerAnnotator(Annotator, ABC):
78
82
  except Exception as e:
79
83
  return {"unknown_error": str(e)}
80
84
 
81
- raw_response = compile()
85
+ with _compilation_lock:
86
+ raw_response = compile()
82
87
  response = {**raw_response}
83
88
  if "media_object" in response:
84
89
  response["media_object"] = from_dict(MediaObject, response["media_object"])
@@ -5,7 +5,7 @@ from typing import Any
5
5
  from helm.common.general import ensure_directory_exists
6
6
  from helm.benchmark.adaptation.request_state import RequestState
7
7
  from helm.benchmark.annotation.annotator import Annotator
8
- from helm.benchmark.annotation.score_util import score_with_reasoning
8
+ from helm.benchmark.annotation.model_as_judge import score_with_reasoning
9
9
  from helm.clients.auto_client import AutoClient
10
10
 
11
11
  PROMPT_TEMPLATE = """You are a medical judge will be given a user request and a model response to that request.
@@ -0,0 +1,99 @@
1
+ from typing import Dict, Optional, Set
2
+
3
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
4
+ from helm.clients.auto_client import AutoClient
5
+
6
+
7
# Judge prompt sent to each annotator model. The {{QUESTION}}, {{RESPONSE}}, and
# {{GOLD_RESPONSE}} placeholders are presumably substituted by the LLMAsJuryAnnotator
# base class before the request is issued — confirm against model_as_judge.
# The judges must reply with a single JSON object matching ANNOTATION_CRITERIA.
PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of medical dialogue summaries.

Your goal is to assess how well the summary captures the patient-doctor conversation,
and how it compares to the gold response in terms of accuracy, completeness, and clarity.

The patient-doctor conversation will be provided in these tags:
<conversation>
{{QUESTION}}
</conversation>

The response will be provided in these tags:
<response>
{{RESPONSE}}
</response>

The reference response will be provided in these tags:
<gold_response>
{{GOLD_RESPONSE}}
</gold_response>

Carefully review the <response> and compare it to the <gold_response> when needed.

Please evaluate the generated summary on a scale of 1-5 (1 = poor, 5 = excellent)
for each of these three key dimensions:

Evaluation Criteria:
Accuracy (1-5)
- Does the summary correctly capture the main medical issue and clinical details from the conversation?

Completeness (1-5)
- Does the summary include all important medical information from the conversation?

Clarity (1-5)
- Is the summary easy to understand for clinical use?

Output Format:
Output your evaluation as a single valid JSON object matching the following structure:
{
"accuracy": {
"score": 0,
"explanation": "Brief explanation of why this score was given."
},
"completeness": {
"score": 0,
"explanation": "Brief explanation of why this score was given."
},
"clarity": {
"score": 0,
"explanation": "Brief explanation of why this score was given."
}
}

Ensure the output is valid JSON:
- Use **double quotes** (") for all keys and string values.
- When quoting text or sections inside the explanations, use escaped double quotes (\") to
maintain valid JSON formatting.
- Do not include any additional information in the output.
"""
65
+
66
# Expected shape of each judge's JSON reply: every graded dimension must carry
# both a numeric "score" and a textual "explanation".
ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    criterion: {"score", "explanation"}
    for criterion in ("accuracy", "completeness", "clarity")
}
71
+
72
# Short alias -> fully-qualified judge model name. Aliases key the per-judge
# entries in the annotation output.
_JUDGE_MODEL_NAMES: Dict[str, str] = {
    "gpt": "openai/gpt-4o-2024-05-13",
    "llama": "meta/llama-3.3-70b-instruct",
    "claude": "anthropic/claude-3-7-sonnet-20250219",
}

# All judges are reached through the "stanfordhealthcare/" deployments, whose
# suffix is the model name with its provider prefix dropped.
ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
    alias: AnnotatorModelInfo(
        model_name=full_name,
        model_deployment="stanfordhealthcare/" + full_name.split("/", 1)[1],
    )
    for alias, full_name in _JUDGE_MODEL_NAMES.items()
}
86
+
87
+
88
class MedDialogAnnotator(LLMAsJuryAnnotator):
    """LLM-as-jury autograder for MedDialog summaries.

    Scores responses on accuracy, completeness, and clarity using the
    module-level prompt, criteria, and judge-model configuration.
    """

    # Identifier under which this annotator's outputs are keyed.
    name = "med_dialog"

    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
        # NOTE(review): template_name is accepted for interface compatibility
        # but is not forwarded to the base class.
        jury_config = dict(
            auto_client=auto_client,
            prompt_template=PROMPT_TEMPLATE,
            annotation_criteria=ANNOTATION_CRITERIA,
            annotator_models=ANNOTATOR_MODELS,
        )
        super().__init__(**jury_config)