crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/annotation/live_qa_annotator.py

@@ -5,7 +5,7 @@ from typing import Any
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.benchmark.annotation.score_util import score_with_reasoning
+from helm.benchmark.annotation.model_as_judge import score_with_reasoning
 from helm.clients.auto_client import AutoClient

 PROMPT_TEMPLATE = """You are a medical judge will be given a user request and a model response to that request.
@@ -28,11 +28,16 @@ Determine if the model is correct with respect to one or more of the <correct_re
 # 0.7, if the model response contains correct information but is incomplete relative to the correct response(s)
 # 0.3, if the model response contains some correct information but is incomplete and/or incorrect
 # 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
-Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag.
+Please output your one-sentence concise reasoning within the "reasoning" tags and your score within the "score" tags.
 Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
-Format:
-##short_reasoning: <your concise and brief reasoning>
-##the_score: <your score>"""
+Only output a tag-delimited object with the following format:
+<reasoning>
+INSERT_YOUR_REASONING_HERE
+</reasoning>
+<score>
+INSERT_YOUR_SCORE_HERE
+</score>
+"""


 class LiveQAAnnotator(Annotator):
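
For context on the change above: the judge is now asked for a tag-delimited reply instead of the old ##short_reasoning / ##the_score markers. Below is a minimal, illustrative sketch of how such a reply could be parsed; the actual extraction is handled by score_with_reasoning in helm/benchmark/annotation/model_as_judge.py, whose internals are not shown in this diff, so treat the helper name and regexes here as assumptions.

import re

# Hypothetical judge completion in the new tag-delimited format.
completion = """<reasoning>
Matches the reference answer on the key facts.
</reasoning>
<score>
0.7
</score>"""

def parse_tagged_verdict(text: str) -> dict:
    """Pull the one-sentence reasoning and the numeric score out of the tagged reply."""
    reasoning = re.search(r"<reasoning>(.*?)</reasoning>", text, re.DOTALL)
    score = re.search(r"<score>(.*?)</score>", text, re.DOTALL)
    if reasoning is None or score is None:
        return {"reasoning": "UNPARSEABLE_RESPONSE", "score": 0.0}
    return {"reasoning": reasoning.group(1).strip(), "score": float(score.group(1).strip())}

print(parse_tagged_verdict(completion))
# {'reasoning': 'Matches the reference answer on the key facts.', 'score': 0.7}
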
helm/benchmark/annotation/med_dialog_annotator.py

@@ -0,0 +1,99 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of medical dialogue summaries.
+
+Your goal is to assess how well the summary captures the patient-doctor conversation,
+and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The patient-doctor conversation will be provided in these tags:
+<conversation>
+{{QUESTION}}
+</conversation>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully review the <response> and compare it to the <gold_response> when needed.
+
+Please evaluate the generated summary on a scale of 1-5 (1 = poor, 5 = excellent)
+for each of these three key dimensions:
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Does the summary correctly capture the main medical issue and clinical details from the conversation?
+
+Completeness (1-5)
+- Does the summary include all important medical information from the conversation?
+
+Clarity (1-5)
+- Is the summary easy to understand for clinical use?
+
+Output Format:
+Output your evaluation as a single valid JSON object matching the following structure:
+{
+    "accuracy": {
+        "score": 0,
+        "explanation": "Brief explanation of why this score was given."
+    },
+    "completeness": {
+        "score": 0,
+        "explanation": "Brief explanation of why this score was given."
+    },
+    "clarity": {
+        "score": 0,
+        "explanation": "Brief explanation of why this score was given."
+    }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+  maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MedDialogAnnotator(LLMAsJuryAnnotator):
+    """The MedDialog autograder."""
+
+    name = "med_dialog"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )
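
The new MedHELM annotator files in this release (med_dialog, medalign, medi_qa, mental_health, and others) all follow the same LLM-as-jury pattern shown above: the instance's question, the model response, and the gold response are substituted into PROMPT_TEMPLATE, each judge listed in ANNOTATOR_MODELS returns a JSON verdict, and that verdict is checked against ANNOTATION_CRITERIA. The real logic lives in LLMAsJuryAnnotator inside model_as_judge.py and is not part of this hunk; the sketch below only illustrates the fill-and-validate idea with hypothetical helper names.

import json
from typing import Dict, Set

def fill_template(template: str, question: str, response: str, gold: str) -> str:
    # Substitute the scenario fields into the jury prompt (same placeholders as PROMPT_TEMPLATE).
    return (
        template.replace("{{QUESTION}}", question)
        .replace("{{RESPONSE}}", response)
        .replace("{{GOLD_RESPONSE}}", gold)
    )

def validate_verdict(raw_json: str, criteria: Dict[str, Set[str]]) -> dict:
    # Ensure every criterion (accuracy, completeness, clarity) is present with its required keys.
    verdict = json.loads(raw_json)
    for criterion, required_keys in criteria.items():
        if criterion not in verdict or not required_keys.issubset(verdict[criterion]):
            raise ValueError(f"Malformed verdict for criterion: {criterion}")
    return verdict
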
helm/benchmark/annotation/medalign_annotator.py

@@ -0,0 +1,100 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert responsible for evaluating the quality of a
+response to a clinical instruction, given an associated Electronic Health Record (EHR).
+
+Your goal is to assess how well the response interprets the EHR, follows clinical best practices,
+and aligns with the gold response in terms of accuracy, completeness, and clarity.
+
+The instruction and EHR pair will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The gold response (reference answer) will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully review the <response> based on the <user_request> and compare it to
+the <gold_response> when needed.
+For each category below, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent)
+and provide a brief explanation for your score:
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Does the response provide correct medical information based on the EHR and clinical guidelines?
+
+Completeness (1-5)
+- Does the response include all important patient information and address the clinical question?
+
+Clarity (1-5)
+- Is the response written clearly so clinicians can easily understand it?
+
+Output Format:
+Generate a valid JSON object with your evaluation:
+{
+    "accuracy": {
+        "score": 0,
+        "explanation": "Explain why this score was given."
+    },
+    "completeness": {
+        "score": 0,
+        "explanation": "Explain why this score was given."
+    },
+    "clarity": {
+        "score": 0,
+        "explanation": "Explain why this score was given."
+    }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+  maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MedalignAnnotator(LLMAsJuryAnnotator):
+    """The Medalign autograder."""
+
+    name = "medalign"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )
helm/benchmark/annotation/medi_qa_annotator.py

@@ -0,0 +1,98 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
+response to a consumer health question.
+Your goal is to assess how well the generated response answers the medical question,
+and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+The question will be provided in these tags:
+<question>
+{{QUESTION}}
+</question>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+The reference answer will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully analyze the <response> compared to the <gold_response> and the original <question>.
+For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+and provide a short justification for your score.
+
+Evaluation Criteria:
+Accuracy (1-5)
+- Is the medical information correct?
+
+Completeness (1-5)
+- Does the response fully answer the question and include important medical details?
+
+Clarity (1-5)
+- Is the response easy for consumers to understand?
+
+Output Format:
+Output the evaluation as a single valid JSON object matching the following structure:
+{
+    "accuracy": {
+        "score": 0,
+        "explanation": "Explain why this score was given."
+    },
+    "completeness": {
+        "score": 0,
+        "explanation": "Explain why this score was given."
+    },
+    "clarity": {
+        "score": 0,
+        "explanation": "Explain why this score was given."
+    }
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+  maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+    "accuracy": {"score", "explanation"},
+    "completeness": {"score", "explanation"},
+    "clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+    "gpt": AnnotatorModelInfo(
+        model_name="openai/gpt-4o-2024-05-13",
+        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+    ),
+    "llama": AnnotatorModelInfo(
+        model_name="meta/llama-3.3-70b-instruct",
+        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+    ),
+    "claude": AnnotatorModelInfo(
+        model_name="anthropic/claude-3-7-sonnet-20250219",
+        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+    ),
+}
+
+
+class MediQAAnnotator(LLMAsJuryAnnotator):
+    """The MediQA autograder."""
+
+    name = "medi_qa"
+
+    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+        super().__init__(
+            auto_client=auto_client,
+            prompt_template=PROMPT_TEMPLATE,
+            annotation_criteria=ANNOTATION_CRITERIA,
+            annotator_models=ANNOTATOR_MODELS,
+        )
@@ -1,68 +1,97 @@
1
- import os
2
- import re
3
- from typing import Any
4
-
5
- from helm.common.general import ensure_directory_exists
6
- from helm.benchmark.adaptation.request_state import RequestState
7
- from helm.benchmark.annotation.annotator import Annotator
8
- from helm.benchmark.annotation.model_as_judge import score_with_reasoning
1
+ from typing import Dict, Optional, Set
2
+
3
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
9
4
  from helm.clients.auto_client import AutoClient
10
5
 
11
- PROMPT_TEMPLATE = """You are a medical judge will be given a user request and a model response to that request.
12
- Your task is to evaluate whether the model response properly answers the request based on a set of correct answers.
13
- The user's request will be provided in these tags:
14
- <user_request>
6
+
7
+ PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating responses to consumer medication questions.
8
+
9
+ Your goal is to assess how well the response captures the information asked,
10
+ and how it compares to the gold response in terms of accuracy, completeness, and clarity.
11
+
12
+ The question provided in these tags:
13
+ <medication_question>
15
14
  {{QUESTION}}
16
- </user_request>
17
- The model's response will be provided in these tags:
18
- <model_response>
19
- {{ANSWER}}
20
- </model_response>
21
- Some potential correct responses will be provided in these tags:
22
- <correct_response>
23
- {{CORRECT_RESPONSE}}
24
- </correct_response>
25
- Carefully analyze the <model_response>.
26
- Determine if the model is correct with respect to one or more of the <correct_responses> on a range from 0 to 1.
27
- # 1, if the model response fully matches the correct response content and intent for at least one correct response
28
- # 0.5, if the model response contains some correct information but is incomplete and/or incorrect
- # 0, if the model response is completely unrelated or incorrect relative to the correct response(s)
- Output your one-sentence concise reasoning in the ##short_reasoning tag and your score in the ##the_score tag.
- Your reasoning should be less than 20 tokens. The score should be a single number with no other output.
- Format:
- ##short_reasoning: <your concise and brief reasoning>
- ##the_score: <your score>"""
-
-
- class MedicationQAAnnotator(Annotator):
-     """The LiveQA autograder."""
+ </medication_question>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ The reference response will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully review the <response> and compare it to the <gold_response> when needed.
+ Please evaluate the response on a scale of 1-5 (1 = poor, 5 = excellent) for each of these three key dimensions:
+
+ Evaluation Criteria:
+ Accuracy (1-5)
+ - Is the information about the medication factually correct?
+
+ Completeness (1-5)
+ - Does the response fully answer the question, covering all important points?
+
+ Clarity (1-5)
+ - Is the response easy for a consumer to understand?
+
+ Output Format:
+ Output your evaluation as a single valid JSON object matching the following structure:
+ {
+ "accuracy": {
+ "score": 0,
+ "explanation": "Brief explanation of why this score was given."
+ },
+ "completeness": {
+ "score": 0,
+ "explanation": "Brief explanation of why this score was given."
+ },
+ "clarity": {
+ "score": 0,
+ "explanation": "Brief explanation of why this score was given."
+ }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+ maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+     "accuracy": {"score", "explanation"},
+     "completeness": {"score", "explanation"},
+     "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+     "gpt": AnnotatorModelInfo(
+         model_name="openai/gpt-4o-2024-05-13",
+         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+     ),
+     "llama": AnnotatorModelInfo(
+         model_name="meta/llama-3.3-70b-instruct",
+         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+     ),
+     "claude": AnnotatorModelInfo(
+         model_name="anthropic/claude-3-7-sonnet-20250219",
+         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+     ),
+ }
+
+
+ class MedicationQAAnnotator(LLMAsJuryAnnotator):
+     """The MedicationQA autograder."""

      name = "medication_qa"

-     def __init__(self, auto_client: AutoClient, file_storage_path: str):
-         self._auto_client = auto_client
-         cache_dir = os.path.join(file_storage_path, "data")
-         ensure_directory_exists(cache_dir)
-         # Regex pattern is lenient to allow for typos e.g. extra whitespace
-         self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
-
-     def annotate(self, request_state: RequestState) -> Any:
-         assert request_state.result
-         assert len(request_state.result.completions) == 1
-         model_input_text = request_state.request.prompt
-         model_output_text = request_state.result.completions[0].text
-         if not model_output_text.strip():
-             return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 0.0}
-         correct_response = request_state.instance.references[0].output.text
-         annotator_prompt = (
-             PROMPT_TEMPLATE.strip()
-             .replace("{{QUESTION}}", model_input_text)
-             .replace("{{ANSWER}}", model_output_text)
-             .replace("{{CORRECT_RESPONSE}}", correct_response)
-         )
-         result = score_with_reasoning(
-             self._auto_client, annotator_prompt, "openai/gpt-4o-2024-05-13", "openai/gpt-4o-2024-05-13"
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         super().__init__(
+             auto_client=auto_client,
+             prompt_template=PROMPT_TEMPLATE,
+             annotation_criteria=ANNOTATION_CRITERIA,
+             annotator_models=ANNOTATOR_MODELS,
          )
-         reasoning = result["reasoning"]
-         score = result["score"]
-         return {"prompt_text": annotator_prompt, "reasoning": reasoning, "score": score}
@@ -0,0 +1,98 @@
+ from typing import Dict, Optional, Set
+
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+ from helm.clients.auto_client import AutoClient
+
+
+ PROMPT_TEMPLATE = """You are a mental health expert tasked with evaluating the quality
+ of a counselor response in a therapy conversation.
+ Your goal is to assess how well the generated response demonstrates therapeutic skills,
+ and how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+ The conversation history and question will be provided in these tags:
+ <conversation>
+ {{QUESTION}}
+ </conversation>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ The reference response will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully analyze the <response> compared to the <gold_response> and the original <conversation>.
+ For each of the following criteria, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+ and provide a short justification for your score.
+
+ Evaluation Criteria:
+ Accuracy (1-5)
+ - Does the response use appropriate therapeutic techniques and show understanding of the client's concerns?
+
+ Completeness (1-5)
+ - Does the response address the client's key concerns and include important context from the conversation?
+
+ Clarity (1-5)
+ - Is the response easy to understand and communicated in an empathetic, therapeutic way?
+
+ Output Format:
+ Output the evaluation as a single valid JSON object matching the following structure:
+ {
+ "accuracy": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "completeness": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "clarity": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+ maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+     "accuracy": {"score", "explanation"},
+     "completeness": {"score", "explanation"},
+     "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+     "gpt": AnnotatorModelInfo(
+         model_name="openai/gpt-4o-2024-05-13",
+         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+     ),
+     "llama": AnnotatorModelInfo(
+         model_name="meta/llama-3.3-70b-instruct",
+         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+     ),
+     "claude": AnnotatorModelInfo(
+         model_name="anthropic/claude-3-7-sonnet-20250219",
+         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+     ),
+ }
+
+
+ class MentalHealthAnnotator(LLMAsJuryAnnotator):
+     """The MentalHealth autograder."""
+
+     name = "mental_health"
+
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         super().__init__(
+             auto_client=auto_client,
+             prompt_template=PROMPT_TEMPLATE,
+             annotation_criteria=ANNOTATION_CRITERIA,
+             annotator_models=ANNOTATOR_MODELS,
+         )
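
MentalHealthAnnotator reuses the same three-model jury and criteria as MedicationQAAnnotator; only the prompt template differs. How LLMAsJuryAnnotator combines the three verdicts is not shown in this diff, so the sketch below only illustrates one plausible mean-per-criterion aggregation over hypothetical judge outputs; it is not the package's own aggregation code.

from statistics import mean
from typing import Dict

# Hypothetical per-judge scores keyed by the short names used in ANNOTATOR_MODELS.
verdicts: Dict[str, Dict[str, int]] = {
    "gpt": {"accuracy": 4, "completeness": 3, "clarity": 5},
    "llama": {"accuracy": 4, "completeness": 4, "clarity": 4},
    "claude": {"accuracy": 5, "completeness": 3, "clarity": 4},
}

# Average each criterion across the jury (illustrative only).
jury_scores = {
    criterion: mean(judge[criterion] for judge in verdicts.values())
    for criterion in ("accuracy", "completeness", "clarity")
}
print(jury_scores)  # {'accuracy': 4.33..., 'completeness': 3.33..., 'clarity': 4.33...}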
@@ -0,0 +1,100 @@
+ from typing import Dict, Optional, Set
+
+ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+ from helm.clients.auto_client import AutoClient
+
+
+ PROMPT_TEMPLATE = """You are tasked with evaluating the quality of the generated impression section
+ of a radiology report based on the provided findings.
+ Your goal is to assess how well the impression section captures all the clinical findings and
+ how it compares to the gold response in terms of accuracy, completeness, and clarity.
+
+ The user's request will be provided in these tags:
+ <user_request>
+ {{QUESTION}}
+ </user_request>
+
+ The response will be provided in these tags:
+ <response>
+ {{RESPONSE}}
+ </response>
+
+ Some potential correct responses will be provided in these tags:
+ <gold_response>
+ {{GOLD_RESPONSE}}
+ </gold_response>
+
+ Carefully analyze the <response>.
+ For each of the following categories, rate the Response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+ and provide a short justification for your score.
+
+ Your evaluation should focus on the following criteria:
+
+ Evaluation Criteria:
+ Accuracy (1-5)
+ - Does the impression correctly reflect the key findings from the radiology report?
+
+ Completeness (1-5)
+ - Does the impression include all important findings and address the clinical question?
+
+ Clarity (1-5)
+ - Is the impression easy for referring clinicians to understand?
+
+ Output Format:
+ Output the evaluation as a single valid JSON object matching the following structure:
+ {
+ "accuracy": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "completeness": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ },
+ "clarity": {
+ "score": 0,
+ "explanation": "Explain why this score was given."
+ }
+ }
+
+ Ensure the output is valid JSON:
+ - Use **double quotes** (") for all keys and string values.
+ - When quoting text or sections inside the explanations, use escaped double quotes (\") to
+ maintain valid JSON formatting.
+ - Do not include any additional information in the output.
+ """
+
+ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+     "accuracy": {"score", "explanation"},
+     "completeness": {"score", "explanation"},
+     "clarity": {"score", "explanation"},
+ }
+
+ ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+     "gpt": AnnotatorModelInfo(
+         model_name="openai/gpt-4o-2024-05-13",
+         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+     ),
+     "llama": AnnotatorModelInfo(
+         model_name="meta/llama-3.3-70b-instruct",
+         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+     ),
+     "claude": AnnotatorModelInfo(
+         model_name="anthropic/claude-3-7-sonnet-20250219",
+         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+     ),
+ }
+
+
+ class MIMICRRSAnnotator(LLMAsJuryAnnotator):
+     """The MIMICRRS autograder."""
+
+     name = "mimic_rrs"
+
+     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+         super().__init__(
+             auto_client=auto_client,
+             prompt_template=PROMPT_TEMPLATE,
+             annotation_criteria=ANNOTATION_CRITERIA,
+             annotator_models=ANNOTATOR_MODELS,
+         )
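
All three files added here follow the same pattern: a module-level prompt template with {{QUESTION}}/{{RESPONSE}}/{{GOLD_RESPONSE}} placeholders, an ANNOTATION_CRITERIA mapping, an ANNOTATOR_MODELS jury, and a thin subclass of LLMAsJuryAnnotator. The sketch below restates that pattern for a hypothetical new task; the name "example_task" and its template are placeholders, and any additional wiring (such as registering the annotator in a run spec) is outside this diff.

from typing import Dict, Optional, Set

from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
from helm.clients.auto_client import AutoClient

# Placeholder template for a hypothetical task; real templates spell out the
# evaluation criteria and the required JSON output, as in the files above.
EXAMPLE_PROMPT_TEMPLATE = """Evaluate the response.
<question>{{QUESTION}}</question>
<response>{{RESPONSE}}</response>
<gold_response>{{GOLD_RESPONSE}}</gold_response>
Return a JSON object with an "accuracy" entry containing "score" and "explanation"."""

EXAMPLE_ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
    "accuracy": {"score", "explanation"},
}

EXAMPLE_ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
    "gpt": AnnotatorModelInfo(
        model_name="openai/gpt-4o-2024-05-13",
        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
    ),
}


class ExampleTaskAnnotator(LLMAsJuryAnnotator):
    """Hypothetical annotator illustrating the recurring LLM-as-jury pattern."""

    name = "example_task"

    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
        super().__init__(
            auto_client=auto_client,
            prompt_template=EXAMPLE_PROMPT_TEMPLATE,
            annotation_criteria=EXAMPLE_ANNOTATION_CRITERIA,
            annotator_models=EXAMPLE_ANNOTATOR_MODELS,
        )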