crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic by the registry scanner.

Files changed (243)
  1. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
  2. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
  3. helm/benchmark/adaptation/adapter_spec.py +5 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/alrage_annotator.py +90 -0
  8. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  9. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  10. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  11. helm/benchmark/annotation/medalign_annotator.py +11 -22
  12. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  13. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  14. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  15. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  16. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  17. helm/benchmark/annotation/model_as_judge.py +23 -18
  18. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  19. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  20. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  21. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  22. helm/benchmark/metrics/alrage_metric.py +35 -0
  23. helm/benchmark/metrics/basic_metrics.py +267 -2
  24. helm/benchmark/metrics/classification_metrics.py +19 -1
  25. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  26. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  27. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  28. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  29. helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
  30. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  31. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  32. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  33. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  34. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  35. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  36. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  37. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  38. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  39. helm/benchmark/metrics/medec_metrics.py +25 -2
  40. helm/benchmark/metrics/metric.py +25 -0
  41. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  42. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  43. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  44. helm/benchmark/metrics/summac/model_summac.py +2 -2
  45. helm/benchmark/metrics/summarization_metrics.py +129 -1
  46. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  47. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  48. helm/benchmark/presentation/schema.py +5 -22
  49. helm/benchmark/presentation/summarize.py +180 -11
  50. helm/benchmark/presentation/taxonomy_info.py +20 -0
  51. helm/benchmark/run_expander.py +4 -0
  52. helm/benchmark/run_specs/arabic_run_specs.py +134 -16
  53. helm/benchmark/run_specs/bluex_run_specs.py +1 -1
  54. helm/benchmark/run_specs/classic_run_specs.py +2 -2
  55. helm/benchmark/run_specs/long_context_run_specs.py +2 -2
  56. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  57. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  58. helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
  59. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  60. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  61. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  62. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  63. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  64. helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
  65. helm/benchmark/scenarios/aratrust_scenario.py +19 -0
  66. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  67. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  68. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  69. helm/benchmark/scenarios/bluex_scenario.py +6 -2
  70. helm/benchmark/scenarios/bold_scenario.py +15 -0
  71. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  72. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  73. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  74. helm/benchmark/scenarios/clear_scenario.py +23 -0
  75. helm/benchmark/scenarios/cleva_scenario.py +479 -0
  76. helm/benchmark/scenarios/code_scenario.py +28 -0
  77. helm/benchmark/scenarios/commonsense_scenario.py +26 -0
  78. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  79. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  80. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  81. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  82. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  83. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  84. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  85. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  86. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  87. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  88. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  89. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  90. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  91. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  92. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  93. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  94. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  95. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  96. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  97. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  98. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  99. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  100. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  101. helm/benchmark/scenarios/gsm_scenario.py +15 -0
  102. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  103. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  104. helm/benchmark/scenarios/ice_scenario.py +21 -1
  105. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  106. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  107. helm/benchmark/scenarios/koala_scenario.py +21 -1
  108. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  109. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  110. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  111. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  112. helm/benchmark/scenarios/legalbench_scenario.py +20 -0
  113. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  114. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  115. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  116. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  117. helm/benchmark/scenarios/math_scenario.py +26 -0
  118. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  119. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  120. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  121. helm/benchmark/scenarios/med_qa_scenario.py +14 -0
  122. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  123. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  124. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  125. helm/benchmark/scenarios/medec_scenario.py +23 -0
  126. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  127. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  128. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  129. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  130. helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
  131. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  132. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  133. helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
  134. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  135. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  136. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  137. helm/benchmark/scenarios/mmlu_scenario.py +15 -0
  138. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  139. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  140. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  141. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  142. helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
  143. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  144. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  145. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  146. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  147. helm/benchmark/scenarios/quac_scenario.py +14 -0
  148. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  149. helm/benchmark/scenarios/raft_scenario.py +15 -0
  150. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  151. helm/benchmark/scenarios/scenario.py +31 -0
  152. helm/benchmark/scenarios/seahelm_scenario.py +348 -0
  153. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  154. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  155. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  156. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  157. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  158. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  159. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  160. helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
  161. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  162. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  163. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  164. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  165. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  166. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  167. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  168. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  169. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  170. helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
  171. helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
  172. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  173. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  175. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  176. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  177. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  178. helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
  179. helm/benchmark/static/schema_arabic.yaml +55 -12
  180. helm/benchmark/static/schema_long_context.yaml +17 -17
  181. helm/benchmark/static/schema_medhelm.yaml +36 -0
  182. helm/benchmark/static/schema_slp.yaml +219 -0
  183. helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
  184. helm/benchmark/static_build/assets/index-9352595e.css +1 -0
  185. helm/benchmark/static_build/index.html +2 -2
  186. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  187. helm/clients/audio_language/llama_omni/constants.py +9 -0
  188. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  189. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  190. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  191. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  192. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  193. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  194. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  195. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  196. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  197. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  198. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  199. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  200. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  201. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  202. helm/clients/audio_language/llama_omni/utils.py +202 -0
  203. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  204. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  205. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  206. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  207. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  208. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  209. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  210. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  211. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  212. helm/clients/openai_client.py +31 -19
  213. helm/clients/openai_responses_client.py +27 -3
  214. helm/clients/openrouter_client.py +31 -0
  215. helm/clients/test_openrouter_client.py +69 -0
  216. helm/clients/together_client.py +48 -11
  217. helm/clients/vertexai_client.py +8 -2
  218. helm/config/model_deployments.yaml +75 -1
  219. helm/config/model_metadata.yaml +70 -2
  220. helm/config/tokenizer_configs.yaml +19 -1
  221. helm/proxy/example_queries.py +8 -8
  222. helm/proxy/server.py +2 -1
  223. helm/proxy/static/index.css +4 -0
  224. helm/proxy/static/index.js +7 -1
  225. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  226. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  227. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  228. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  229. helm/benchmark/metrics/medalign_metrics.py +0 -14
  230. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  231. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  232. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  233. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  234. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  235. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  236. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  237. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  238. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  239. helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
  240. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
  241. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
  242. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
  243. {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/annotation/mimic_bhc_annotator.py

@@ -12,17 +12,17 @@ compares to the gold response in terms of accuracy, completeness, and clarity.

  The user's request will be provided in these tags:
  <user_request>
- {{QUESTION}}
+ {QUESTION}
  </user_request>

  The response will be provided in these tags:
  <response>
- {{RESPONSE}}
+ {RESPONSE}
  </response>

  A potential correct response will be provided in these tags:
  <gold_response>
- {{GOLD_RESPONSE}}
+ {GOLD_RESPONSE}
  </gold_response>

  Carefully analyze the <response>. For each of the following categories,

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
      "clarity": {"score", "explanation"},
  }

- ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-     "gpt": AnnotatorModelInfo(
-         model_name="openai/gpt-4o-2024-05-13",
-         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-     ),
-     "llama": AnnotatorModelInfo(
-         model_name="meta/llama-3.3-70b-instruct",
-         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-     ),
-     "claude": AnnotatorModelInfo(
-         model_name="anthropic/claude-3-7-sonnet-20250219",
-         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-     ),
- }
-

  class MIMICBHCAnnotator(LLMAsJuryAnnotator):
      """The MIMICBHC autograder."""

-     name = "mimic_bhc"
-
-     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+     def __init__(
+         self,
+         auto_client: AutoClient,
+         annotator_models: Dict[str, AnnotatorModelInfo],
+         template_name: Optional[str] = None,
+     ):
          super().__init__(
+             name="mimic_bhc",
              auto_client=auto_client,
              prompt_template=PROMPT_TEMPLATE,
              annotation_criteria=ANNOTATION_CRITERIA,
-             annotator_models=ANNOTATOR_MODELS,
+             annotator_models=annotator_models,
          )
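
These MedHELM jury annotators no longer hard-code an ANNOTATOR_MODELS table; the judge models are now injected by the caller (the new helm/benchmark/scenarios/medhelm/judges.yaml and the reworked medhelm run specs in this release appear to be where they come from). Below is a minimal sketch of the new call pattern, not code from the package: the import paths and the build_mimic_bhc_annotator helper are assumptions, while the AnnotatorModelInfo fields and the MIMICBHCAnnotator signature are taken from the diff above.

from typing import Dict

from helm.benchmark.annotation.mimic_bhc_annotator import MIMICBHCAnnotator
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo


def build_mimic_bhc_annotator(auto_client) -> MIMICBHCAnnotator:
    # Hypothetical helper: wire a caller-supplied jury into the 0.5.8-style annotator.
    annotator_models: Dict[str, AnnotatorModelInfo] = {
        # Same judge entries the 0.5.7 module hard-coded; any model/deployment pair works here.
        "gpt": AnnotatorModelInfo(
            model_name="openai/gpt-4o-2024-05-13",
            model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
        ),
        "llama": AnnotatorModelInfo(
            model_name="meta/llama-3.3-70b-instruct",
            model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
        ),
    }
    # New signature: annotator_models is required, and the annotator name is now
    # passed to the base class via super().__init__(name="mimic_bhc", ...).
    return MIMICBHCAnnotator(auto_client=auto_client, annotator_models=annotator_models)

The same pattern applies to the other refactored MedHELM annotators in this release (mimic_rrs, mtsamples_procedures, mtsamples_replicate, starr_patient_instructions, and the rest), which differ only in class name and prompt template.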

helm/benchmark/annotation/mimic_rrs_annotator.py

@@ -11,17 +11,17 @@ how it compares to the gold response in terms of accuracy, completeness, and clarity.

  The user's request will be provided in these tags:
  <user_request>
- {{QUESTION}}
+ {QUESTION}
  </user_request>

  The response will be provided in these tags:
  <response>
- {{RESPONSE}}
+ {RESPONSE}
  </response>

  Some potential correct responses will be provided in these tags:
  <gold_response>
- {{GOLD_RESPONSE}}
+ {GOLD_RESPONSE}
  </gold_response>

  Carefully analyze the <response>.

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
      "clarity": {"score", "explanation"},
  }

- ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-     "gpt": AnnotatorModelInfo(
-         model_name="openai/gpt-4o-2024-05-13",
-         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-     ),
-     "llama": AnnotatorModelInfo(
-         model_name="meta/llama-3.3-70b-instruct",
-         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-     ),
-     "claude": AnnotatorModelInfo(
-         model_name="anthropic/claude-3-7-sonnet-20250219",
-         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-     ),
- }
-

  class MIMICRRSAnnotator(LLMAsJuryAnnotator):
      """The MIMICRRS autograder."""

-     name = "mimic_rrs"
-
-     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+     def __init__(
+         self,
+         auto_client: AutoClient,
+         annotator_models: Dict[str, AnnotatorModelInfo],
+         template_name: Optional[str] = None,
+     ):
          super().__init__(
+             name="mimic_rrs",
              auto_client=auto_client,
              prompt_template=PROMPT_TEMPLATE,
              annotation_criteria=ANNOTATION_CRITERIA,
-             annotator_models=ANNOTATOR_MODELS,
+             annotator_models=annotator_models,
          )

helm/benchmark/annotation/model_as_judge.py

@@ -1,6 +1,7 @@
  import json
  import re
  from dataclasses import dataclass
+ from string import Template
  from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set

  from helm.benchmark.adaptation.request_state import RequestState

@@ -113,6 +114,7 @@ class LLMAsJuryAnnotator(Annotator):

      def __init__(
          self,
+         name: str,
          auto_client: AutoClient,
          prompt_template: str,
          annotation_criteria: Dict[str, Set[str]],

@@ -128,6 +130,7 @@ class LLMAsJuryAnnotator(Annotator):
          :param annotator_models: Dictionary of models to use for annotation
          :param preprocessor: Optional function to preprocess model responses
          """
+         self.name = name
          self._auto_client = auto_client
          self._prompt_template = prompt_template
          self._annotation_criteria = annotation_criteria

@@ -147,32 +150,34 @@
      def _interpolate_prompt(
          self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
      ) -> str:
-         """
-         Interpolate prompt template with request state information.
-
-         :param request_state: The current request state
-         :param custom_replacements: Optional dictionary of additional replacements
-         :return: Interpolated prompt
-         """
-         base_replacements = {
-             "{{QUESTION}}": request_state.instance.input.text,
-             "{{RESPONSE}}": (
+         """Interpolate prompt templates safely, supporting {{QUESTION}}-style files."""
+         # Build required/optional fields
+         replacements: Dict[str, str] = {
+             "QUESTION": request_state.instance.input.text,
+             "RESPONSE": (
                  request_state.result.completions[0].text
                  if request_state.result and request_state.result.completions
                  else ""
              ),
-             "{{GOLD_RESPONSE}}": request_state.instance.references[0].output.text,
+             # GOLD is optional; keep empty if not present
+             "GOLD_RESPONSE": (
+                 request_state.instance.references[0].output.text
+                 if getattr(request_state.instance, "references", None)
+                 else ""
+             ),
          }
-
-         # Allow custom replacements to override base replacements
          if custom_replacements:
-             base_replacements.update(custom_replacements)
+             replacements.update(custom_replacements)

-         prompt = self._prompt_template
-         for key, value in base_replacements.items():
-             prompt = prompt.replace(key, str(value))
+         tmpl_text = self._prompt_template
+
+         tmpl_text = (
+             tmpl_text.replace("{QUESTION}", "$QUESTION")
+             .replace("{RESPONSE}", "$RESPONSE")
+             .replace("{GOLD_RESPONSE}", "$GOLD_RESPONSE")
+         )

-         return prompt
+         return Template(tmpl_text).substitute(replacements)

      def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool:
          """

helm/benchmark/annotation/mtsamples_procedures_annotator.py

@@ -11,17 +11,17 @@ and compares to the gold response in terms of accuracy, completeness, and clarity.

  The user's request will be provided in these tags:
  <user_request>
- {{QUESTION}}
+ {QUESTION}
  </user_request>

  The response will be provided in these tags:
  <response>
- {{RESPONSE}}
+ {RESPONSE}
  </response>

  Some potential correct responses will be provided in these tags:
  <gold_response>
- {{GOLD_RESPONSE}}
+ {GOLD_RESPONSE}
  </gold_response>

  Carefully analyze the <response>.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
      "clarity": {"score", "explanation"},
  }

- ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-     "gpt": AnnotatorModelInfo(
-         model_name="openai/gpt-4o-2024-05-13",
-         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-     ),
-     "llama": AnnotatorModelInfo(
-         model_name="meta/llama-3.3-70b-instruct",
-         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-     ),
-     "claude": AnnotatorModelInfo(
-         model_name="anthropic/claude-3-7-sonnet-20250219",
-         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-     ),
- }
-

  class MTSamplesProceduresAnnotator(LLMAsJuryAnnotator):
      """The MTSamplesProcedures autograder."""

-     name = "mtsamples_procedures"
-
-     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+     def __init__(
+         self,
+         auto_client: AutoClient,
+         annotator_models: Dict[str, AnnotatorModelInfo],
+         template_name: Optional[str] = None,
+     ):
          super().__init__(
+             name="mtsamples_procedures",
              auto_client=auto_client,
              prompt_template=PROMPT_TEMPLATE,
              annotation_criteria=ANNOTATION_CRITERIA,
-             annotator_models=ANNOTATOR_MODELS,
+             annotator_models=annotator_models,
          )

helm/benchmark/annotation/mtsamples_replicate_annotator.py

@@ -13,17 +13,17 @@ and aligns with the gold standard response in accuracy, completeness, and clarity.

  The patient's information will be provided in these tags:
  <patient_information>
- {{QUESTION}}
+ {QUESTION}
  </patient_information>

  The proposed treatment plan will be provided in these tags:
  <response>
- {{RESPONSE}}
+ {RESPONSE}
  </response>

  The gold standard treatment plan will be provided in these tags:
  <gold_response>
- {{GOLD_RESPONSE}}
+ {GOLD_RESPONSE}
  </gold_response>

  Carefully review the <response> based on the <patient_information> and compare it to the <gold_response> when needed.

@@ -71,31 +71,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
      "clarity": {"score", "explanation"},
  }

- ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-     "gpt": AnnotatorModelInfo(
-         model_name="openai/gpt-4o-2024-05-13",
-         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-     ),
-     "llama": AnnotatorModelInfo(
-         model_name="meta/llama-3.3-70b-instruct",
-         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-     ),
-     "claude": AnnotatorModelInfo(
-         model_name="anthropic/claude-3-7-sonnet-20250219",
-         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-     ),
- }
-

  class MTSamplesReplicateAnnotator(LLMAsJuryAnnotator):
      """The MTSamplesReplicate autograder."""

-     name = "mtsamples_replicate"
-
-     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+     def __init__(
+         self,
+         auto_client: AutoClient,
+         annotator_models: Dict[str, AnnotatorModelInfo],
+         template_name: Optional[str] = None,
+     ):
          super().__init__(
+             name="mtsamples_replicate",
              auto_client=auto_client,
              prompt_template=PROMPT_TEMPLATE,
              annotation_criteria=ANNOTATION_CRITERIA,
-             annotator_models=ANNOTATOR_MODELS,
+             annotator_models=annotator_models,
          )

helm/benchmark/annotation/starr_patient_instructions_annotator.py

@@ -11,17 +11,17 @@ procedure, and how it compares to the gold response in terms of accuracy, completeness, and clarity.

  The case will be provided in these tags:
  <case_details>
- {{QUESTION}}
+ {QUESTION}
  </case_details>

  The response will be provided in these tags:
  <response>
- {{RESPONSE}}
+ {RESPONSE}
  </response>

  The reference response will be provided in these tags:
  <gold_response>
- {{GOLD_RESPONSE}}
+ {GOLD_RESPONSE}
  </gold_response>

  Carefully review the <response> and compare it to the <gold_response> when needed.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
      "clarity": {"score", "explanation"},
  }

- ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-     "gpt": AnnotatorModelInfo(
-         model_name="openai/gpt-4o-2024-05-13",
-         model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-     ),
-     "llama": AnnotatorModelInfo(
-         model_name="meta/llama-3.3-70b-instruct",
-         model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-     ),
-     "claude": AnnotatorModelInfo(
-         model_name="anthropic/claude-3-7-sonnet-20250219",
-         model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-     ),
- }
-

  class StarrPatientInstructionsAnnotator(LLMAsJuryAnnotator):
      """The StarrPatientInstructions autograder."""

-     name = "starr_patient_instructions"
-
-     def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+     def __init__(
+         self,
+         auto_client: AutoClient,
+         annotator_models: Dict[str, AnnotatorModelInfo],
+         template_name: Optional[str] = None,
+     ):
          super().__init__(
+             name="starr_patient_instructions",
              auto_client=auto_client,
              prompt_template=PROMPT_TEMPLATE,
              annotation_criteria=ANNOTATION_CRITERIA,
-             annotator_models=ANNOTATOR_MODELS,
+             annotator_models=annotator_models,
          )