crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/adaptation/adapter_spec.py

@@ -63,6 +63,11 @@ class AdapterSpec:
     reference_prefix: str = "A. "
     """The string that is included before each reference (for multiple-choice questions)."""
 
+    # Set hash=False to make `AdapterSpec` hashable
+    reference_prefix_characters: Optional[List[str]] = field(default=None, hash=False)
+    """The characters that are used to identify choices for multiple-choice questions e.g. ["A", "B", "C", "D"].
+    If unset, defaults to the sequence of ascending characters starting from the first character of reference_prefix."""
+
     reference_suffix: str = "\n"
     """The string that is included after each reference (for multiple-choice questions)."""
 
@@ -139,3 +144,8 @@ class AdapterSpec:
     # Set hash=False to make `AdapterSpec` hashable
     eval_splits: Optional[List[str]] = field(default=None, hash=False)
     """The splits from which evaluation instances will be drawn."""
+
+    output_mapping_pattern: Optional[str] = None
+    """Pattern to apply to the output before applying the output mapping for the joint multiple choice adapter.
+    If the pattern has no group, the output mapping will be applied to the first match.
+    If the pattern has a group, the output mapping will be applied to the group of the first match."""
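Together, these two additions let a run spec customize multiple-choice labeling end to end. A minimal sketch of how they might be combined (the field names come from the diff above; the concrete values and the `method` setting are illustrative only):

    from helm.benchmark.adaptation.adapter_spec import AdapterSpec

    # Hypothetical configuration: explicit choice labels, plus a pattern that
    # pulls the chosen label out of a verbose completion such as
    # "The answer is (B)." before the output mapping is applied.
    adapter_spec = AdapterSpec(
        method="multiple_choice_joint",
        reference_prefix="A. ",
        reference_prefix_characters=["A", "B", "C", "D", "E"],
        # One capture group, so the output mapping is applied to the group of
        # the first match rather than to the whole match.
        output_mapping_pattern=r"answer is \(?([A-E])\)?",
    )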

helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py

@@ -18,12 +18,20 @@ class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, A
     learning for multimodal models.
     """
 
-    @staticmethod
-    def get_reference_prefix(prefix: str, i: int) -> str:
+    def get_prefix_char(self, prefix: str) -> str:
+        return [char for char in prefix if char.isalnum()][0]
+
+    def get_reference_prefix(self, prefix: str, i: int) -> str:
         """
         Example: prefix = "\nA. ", i = 2, return "\nC. "
         """
-        return prefix.replace("A", chr(ord("A") + i))
+        old_prefix_char = self.get_prefix_char(prefix)
+        new_prefix_char = (
+            self.adapter_spec.reference_prefix_characters[i]
+            if self.adapter_spec.reference_prefix_characters
+            else chr(ord(old_prefix_char) + i)
+        )
+        return prefix.replace(old_prefix_char, new_prefix_char)
 
     def generate_requests(
         self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]

helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py

@@ -38,22 +38,25 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
     <input_prefix><input><reference_prefixes[index]><reference><output_prefix><output>
     """
 
-    @staticmethod
-    def get_prefix_char(prefix: str) -> str:
+    def get_prefix_char(self, prefix: str) -> str:
         return [char for char in prefix if char.isalnum()][0]
 
-    @staticmethod
-    def get_reference_prefix(prefix: str, i: int) -> str:
+    def get_reference_prefix(self, prefix: str, i: int) -> str:
         """
         Example: prefix = "\nA. ", i = 2, return "\nC. "
         """
-        prefix_char = MultipleChoiceJointAdapter.get_prefix_char(prefix)
-        return prefix.replace(prefix_char, chr(ord(prefix_char) + i))
+        old_prefix_char = self.get_prefix_char(prefix)
+        new_prefix_char = (
+            self.adapter_spec.reference_prefix_characters[i]
+            if self.adapter_spec.reference_prefix_characters
+            else chr(ord(old_prefix_char) + i)
+        )
+        return prefix.replace(old_prefix_char, new_prefix_char)
 
     def generate_requests(
         self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
     ) -> List[RequestState]:
-        prefix_char = MultipleChoiceJointAdapter.get_prefix_char(self.adapter_spec.reference_prefix)
+        prefix_char = self.get_prefix_char(self.adapter_spec.reference_prefix)
         prompt = self.construct_prompt(training_instances, eval_instance, include_output=False, reference_index=None)
         output_mapping: Dict[str, str] = dict(
             (self.get_reference_prefix(prefix_char, reference_index), reference.output.text)
@@ -91,7 +94,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
         # Include the references
         delimiter = ", "
         no_correct_references = "n/a"
-        prefix_char = MultipleChoiceJointAdapter.get_prefix_char(self.adapter_spec.reference_prefix)
+        prefix_char = self.get_prefix_char(self.adapter_spec.reference_prefix)
         output = no_correct_references
         for reference_index, reference in enumerate(instance.references):
             prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
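Both adapters now derive the character to replace from the configured prefix instead of assuming "A", and optionally draw labels from `reference_prefix_characters`. A standalone sketch of the same logic, with the characters passed as a plain argument rather than read off `self.adapter_spec`:

    from typing import List, Optional

    def get_reference_prefix(prefix: str, i: int, characters: Optional[List[str]] = None) -> str:
        """Mirror of the adapter logic: prefix "\nA. " with i=2 becomes "\nC. "."""
        old_prefix_char = [char for char in prefix if char.isalnum()][0]
        new_prefix_char = characters[i] if characters else chr(ord(old_prefix_char) + i)
        return prefix.replace(old_prefix_char, new_prefix_char)

    assert get_reference_prefix("\nA. ", 2) == "\nC. "
    # With explicit labels, e.g. lowercase Roman numerals:
    assert get_reference_prefix("i. ", 2, ["i", "ii", "iii", "iv"]) == "iii. "

Note that str.replace substitutes every occurrence of the old label character in the prefix, which is why short single-character labels are the safe default.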

helm/benchmark/annotation/aci_bench_annotator.py

@@ -10,15 +10,15 @@ Your goal is to assess how well the note captures the clinical information from
 compare it to the reference note (gold standard) in terms of accruacy, completeness and clarity.
 The conversation will be provided in these tags:
 <conversation>
-{{QUESTION}}
+{QUESTION}
 </conversation>
 The generated note will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 The reference note will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 Carefully review the <response> based on the <conversation> and compare it to the <gold_response> when needed.
 
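The placeholders in this template (and in every MedHELM annotator template below) drop their double braces. The actual substitution lives in model_as_judge.py, which also changed in this release; a plausible reading, sketched here as an assumption rather than the confirmed mechanism, is that templates are now filled via single-brace str.format-style keys:

    # Assumed illustration only: single-brace placeholders filled with str.format.
    template = "<conversation>\n{QUESTION}\n</conversation>"
    prompt = template.format(QUESTION="Patient: I have had a cough for two weeks...")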

@@ -65,31 +65,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class ACIBenchAnnotator(LLMAsJuryAnnotator):
     """The ACIBench autograder."""
 
-    name = "aci_bench"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="aci_bench",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
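With the module-level ANNOTATOR_MODELS table deleted, the jury composition is now injected through the constructor; the same refactor repeats across all the MedHELM annotators below, and the new judges.yaml in the files list suggests the judge models are configured externally. A sketch of calling the new signature, reusing two model/deployment pairs from the removed table; the AnnotatorModelInfo import path and the in-scope auto_client are assumptions:

    from helm.benchmark.annotation.aci_bench_annotator import ACIBenchAnnotator
    from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo  # assumed path

    annotator = ACIBenchAnnotator(
        auto_client=auto_client,  # assumed to be an existing AutoClient instance
        annotator_models={
            "gpt": AnnotatorModelInfo(
                model_name="openai/gpt-4o-2024-05-13",
                model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
            ),
            "llama": AnnotatorModelInfo(
                model_name="meta/llama-3.3-70b-instruct",
                model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
            ),
        },
    )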

helm/benchmark/annotation/air_bench_annotator.py

@@ -37,7 +37,7 @@ class AIRBench2024Annotator(Annotator):
         )
         self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
         # Regex pattern is lenient to allow for typos e.g. extra whitespace
-        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+        self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
         self._model = model or self._DEFAULT_MODEL
         self._model_deployment = model_deployment or self._DEFAULT_MODEL_DEPLOYMENT
 
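The only change is the raw-string prefix. "\s" is an invalid escape sequence in a normal Python string literal: the backslash is left intact, so the compiled regex is identical, but CPython has warned about such escapes since 3.6 (a SyntaxWarning as of 3.12). A quick check that the fix is behavior-preserving:

    import re

    # Raw and non-raw spellings compile to the same pattern; only the warning differs.
    assert re.compile(r"##\s*the_score\s*:(.*)").pattern == "##\\s*the_score\\s*:(.*)"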

helm/benchmark/annotation/alrage_annotator.py

@@ -0,0 +1,90 @@
+from typing import Any
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.hierarchical_logger import hwarn
+from helm.common.request import Request
+
+
+class ALRAGEAnnotator(Annotator):
+    """The ALRAGEAnnotator autograder."""
+
+    name = "alrage"
+
+    _ANNOTATOR_MODEL = "openai/gpt-4o-2024-11-20"
+
+    def __init__(self, auto_client: AutoClient):
+        self._auto_client = auto_client
+
+    def _apply_annotator_template(self, question: str, answer: str, gold: str) -> Any:
+        return [
+            {
+                "role": "system",
+                "content": """أنت مقيّم محايد خبير باللغة العربية. يجب عليك:
+1. تقييم دقة الإجابة مقارنة بالإجابة الصحيحة
+2. التحقق من أن الإجابة مدعومة بالسياق المقدم
+3. تقييم جودة وشمولية الإجابة
+
+مهم جداً: يجب أن يكون ردك رقماً فقط من 0 إلى 10. لا تضف أي نص أو تفسير.""",
+            },
+            {
+                "role": "user",
+                "content": f"""السؤال: {question}
+
+الإجابة المقدمة: {answer}
+
+الإجابة الصحيحة: {gold}
+
+أعط تقييماً من 0 إلى 10:
+0-2: إجابة خاطئة تماماً
+3-4: إجابة جزئية مع أخطاء
+5-6: إجابة متوسطة
+7-8: إجابة جيدة
+9-10: إجابة ممتازة
+
+اكتب رقماً فقط من 0 إلى 10 بدون أي نص إضافي:""",
+            },
+        ]
+
+    def _parse_annotator_response(self, response: str) -> float:
+        """Process the judge's response to extract the score"""
+        try:
+            # Extract the first number from the response content
+            score = float(next(num for num in response.split() if num.replace(".", "", 1).isdigit()))
+            return min(max(score / 10.0, 0.0), 1.0)
+
+        except Exception as e:
+            hwarn(f"Error while processing judge response: {e}")
+            return 0.0
+
+    def annotate(self, request_state: RequestState) -> Any:
+        question = request_state.instance.input.text
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        answer = request_state.result.completions[0].text
+        assert len(request_state.instance.all_correct_references) == 1
+        gold = request_state.instance.all_correct_references[0].output.text
+        messages = self._apply_annotator_template(question, answer, gold)
+        judge_request = Request(
+            model=self._ANNOTATOR_MODEL,
+            model_deployment=self._ANNOTATOR_MODEL,
+            messages=messages,
+            temperature=0.0,
+            max_tokens=2000,
+        )
+        judge_response = self._auto_client.make_request(judge_request)
+        if not judge_response.success:
+            raise Exception(
+                "ALRAGEAnnotator got an error response from " f"{self._ANNOTATOR_MODEL}: {judge_response.error}"
+            )
+        assert len(judge_response.completions) == 1
+        prompt = messages[-1]["content"]
+        response = judge_response.completions[0].text
+        score = self._parse_annotator_response(response)
+
+        return {
+            "prompt": prompt,
+            "response": response,
+            "score": score,
+        }
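The system prompt instructs the judge (in Arabic) to reply with a bare score from 0 to 10, and _parse_annotator_response defensively grabs the first numeric token and rescales it to [0, 1]. The same extraction logic, exercised standalone:

    def parse_score(response: str) -> float:
        """Same extraction as ALRAGEAnnotator._parse_annotator_response."""
        try:
            score = float(next(num for num in response.split() if num.replace(".", "", 1).isdigit()))
            return min(max(score / 10.0, 0.0), 1.0)
        except Exception:
            return 0.0  # unparsable judge output falls back to the lowest score

    assert parse_score("8") == 0.8
    assert parse_score("التقييم: 7.5") == 0.75  # tolerates stray text around the number
    assert parse_score("no number here") == 0.0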

helm/benchmark/annotation/chw_care_plan_annotator.py

@@ -11,12 +11,12 @@ and follows provided instructions in terms of accuracy, structure, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 Carefully analyze the <response>.
@@ -63,31 +63,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class CHWCarePlanAnnotator(LLMAsJuryAnnotator):
     """The CHWCarePlan autograder."""
 
-    name = "chw_care_plan"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="chw_care_plan",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )

helm/benchmark/annotation/dischargeme_annotator.py

@@ -15,19 +15,19 @@ gold response in terms of accuracy, completeness, and clarity.
 The target task of either generating a discharge instruction or brief hospital course along with
 the patient discharge text and radiology report will be provided in these tags:
 <patient_information>
-{{QUESTION}}
+{QUESTION}
 </patient_information>
 
 
 The document will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The gold standard target document (either discharge instructions or a brief hospital course)
 will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response> based on the <patient_information> and compare
@@ -77,31 +77,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class DischargeMeAnnotator(LLMAsJuryAnnotator):
     """The DischargeMe autograder."""
 
-    name = "dischargeme"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="dischargeme",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )

helm/benchmark/annotation/live_qa_annotator.py

@@ -50,7 +50,7 @@ class LiveQAAnnotator(Annotator):
         cache_dir = os.path.join(file_storage_path, "data")
         ensure_directory_exists(cache_dir)
         # Regex pattern is lenient to allow for typos e.g. extra whitespace
-        self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+        self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
 
     def annotate(self, request_state: RequestState) -> Any:
         assert request_state.result

helm/benchmark/annotation/med_dialog_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The patient-doctor conversation will be provided in these tags:
 <conversation>
-{{QUESTION}}
+{QUESTION}
 </conversation>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> and compare it to the <gold_response> when needed.
@@ -69,31 +69,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MedDialogAnnotator(LLMAsJuryAnnotator):
     """The MedDialog autograder."""
 
-    name = "med_dialog"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="med_dialog",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )

helm/benchmark/annotation/medalign_annotator.py

@@ -12,17 +12,17 @@ and aligns with the gold response in terms of accuracy, completeness, and clarit
 
 The instruction and EHR pair will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The gold response (reference answer) will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> based on the <user_request> and compare it to
@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MedalignAnnotator(LLMAsJuryAnnotator):
     """The Medalign autograder."""
 
-    name = "medalign"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="medalign",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )

helm/benchmark/annotation/medi_qa_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The question will be provided in these tags:
 <question>
-{{QUESTION}}
+{QUESTION}
 </question>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The reference answer will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response> compared to the <gold_response> and the original <question>.
@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MediQAAnnotator(LLMAsJuryAnnotator):
     """The MediQA autograder."""
 
-    name = "medi_qa"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="medi_qa",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )

helm/benchmark/annotation/medication_qa_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The question provided in these tags:
 <medication_question>
-{{QUESTION}}
+{QUESTION}
 </medication_question>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> and compare it to the <gold_response> when needed.
@@ -67,31 +67,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MedicationQAAnnotator(LLMAsJuryAnnotator):
     """The MedicationQA autograder."""
 
-    name = "medication_qa"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="medication_qa",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )