crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Potentially problematic release.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/annotation/mimic_bhc_annotator.py

@@ -12,17 +12,17 @@ compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 A potential correct response will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response>. For each of the following categories,

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MIMICBHCAnnotator(LLMAsJuryAnnotator):
     """The MIMICBHC autograder."""
 
-    name = "mimic_bhc"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mimic_bhc",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
         )
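The same pattern repeats across the annotator files below: each one deletes its hard-coded ANNOTATOR_MODELS jury and instead takes the jury as a constructor argument. As a minimal sketch of the new construction path, the helper below recreates the 0.5.7 ensemble explicitly; the import paths are assumptions inferred from this diff's file layout, and build_mimic_bhc_annotator is a hypothetical helper, not part of the package.

    from typing import Dict

    # Import paths assumed from the file layout shown in this diff.
    from helm.benchmark.annotation.mimic_bhc_annotator import MIMICBHCAnnotator
    from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
    from helm.clients.auto_client import AutoClient


    def build_mimic_bhc_annotator(auto_client: AutoClient) -> MIMICBHCAnnotator:
        """Hypothetical helper: recreate the judge jury that 0.5.7 hard-coded."""
        annotator_models: Dict[str, AnnotatorModelInfo] = {
            "gpt": AnnotatorModelInfo(
                model_name="openai/gpt-4o-2024-05-13",
                model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
            ),
            "llama": AnnotatorModelInfo(
                model_name="meta/llama-3.3-70b-instruct",
                model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
            ),
            "claude": AnnotatorModelInfo(
                model_name="anthropic/claude-3-7-sonnet-20250219",
                model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
            ),
        }
        # The jury is now injected rather than read from a module-level constant,
        # which is presumably what the new scenarios/medhelm/judges.yaml configures.
        return MIMICBHCAnnotator(auto_client=auto_client, annotator_models=annotator_models)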
helm/benchmark/annotation/mimic_rrs_annotator.py

@@ -11,17 +11,17 @@ how it compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 Some potential correct responses will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response>.

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MIMICRRSAnnotator(LLMAsJuryAnnotator):
     """The MIMICRRS autograder."""
 
-    name = "mimic_rrs"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mimic_rrs",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
helm/benchmark/annotation/model_as_judge.py

@@ -1,6 +1,7 @@
 import json
 import re
 from dataclasses import dataclass
+from string import Template
 from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set
 
 from helm.benchmark.adaptation.request_state import RequestState

@@ -113,6 +114,7 @@ class LLMAsJuryAnnotator(Annotator):
 
     def __init__(
         self,
+        name: str,
         auto_client: AutoClient,
         prompt_template: str,
         annotation_criteria: Dict[str, Set[str]],

@@ -128,6 +130,7 @@ class LLMAsJuryAnnotator(Annotator):
         :param annotator_models: Dictionary of models to use for annotation
         :param preprocessor: Optional function to preprocess model responses
         """
+        self.name = name
         self._auto_client = auto_client
         self._prompt_template = prompt_template
         self._annotation_criteria = annotation_criteria

@@ -147,32 +150,34 @@ class LLMAsJuryAnnotator(Annotator):
     def _interpolate_prompt(
         self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
     ) -> str:
-        """
-
-
-
-
-        :return: Interpolated prompt
-        """
-        base_replacements = {
-            "{{QUESTION}}": request_state.instance.input.text,
-            "{{RESPONSE}}": (
+        """Interpolate prompt templates safely, supporting {{QUESTION}}-style files."""
+        # Build required/optional fields
+        replacements: Dict[str, str] = {
+            "QUESTION": request_state.instance.input.text,
+            "RESPONSE": (
                 request_state.result.completions[0].text
                 if request_state.result and request_state.result.completions
                 else ""
             ),
-            "{{GOLD_RESPONSE}}": request_state.instance.references[0].output.text,
+            # GOLD is optional; keep empty if not present
+            "GOLD_RESPONSE": (
+                request_state.instance.references[0].output.text
+                if getattr(request_state.instance, "references", None)
+                else ""
+            ),
         }
-
-        # Allow custom replacements to override base replacements
         if custom_replacements:
-            base_replacements.update(custom_replacements)
+            replacements.update(custom_replacements)
 
-        prompt = self._prompt_template
-        for placeholder, value in base_replacements.items():
-            prompt = prompt.replace(placeholder, value)
+        tmpl_text = self._prompt_template
+
+        tmpl_text = (
+            tmpl_text.replace("{QUESTION}", "$QUESTION")
+            .replace("{RESPONSE}", "$RESPONSE")
+            .replace("{GOLD_RESPONSE}", "$GOLD_RESPONSE")
+        )
 
-        return prompt
+        return Template(tmpl_text).substitute(replacements)
 
     def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool:
         """
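The _interpolate_prompt rewrite is the core of this file's change: single-brace placeholders are first rewritten to $-identifiers and then resolved with string.Template, so literal braces elsewhere in a prompt (for example, JSON examples in the grading instructions) can no longer collide with the substitution step. A self-contained sketch of the mechanism, stdlib only, using a made-up template rather than one of the shipped prompts:

    from string import Template

    # Made-up prompt in the new single-brace style. The literal JSON braces in
    # the first line would break str.format()-style interpolation, but
    # string.Template only reacts to "$", so they pass through untouched.
    prompt = (
        'Respond with JSON like {"score": 1, "explanation": "..."}.\n'
        "<user_request>\n{QUESTION}\n</user_request>"
    )

    # The same rewrite the new _interpolate_prompt performs: only the known
    # placeholders become Template identifiers.
    tmpl_text = (
        prompt.replace("{QUESTION}", "$QUESTION")
        .replace("{RESPONSE}", "$RESPONSE")
        .replace("{GOLD_RESPONSE}", "$GOLD_RESPONSE")
    )

    print(Template(tmpl_text).substitute(QUESTION="Summarize this note."))

One residual caveat: Template treats a literal "$" as its own escape character, so a prompt containing a dollar sign would still need "$$"-escaping before substitute() is called.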
helm/benchmark/annotation/mtsamples_procedures_annotator.py

@@ -11,17 +11,17 @@ and compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 Some potential correct responses will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response>.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MTSamplesProceduresAnnotator(LLMAsJuryAnnotator):
     """The MTSamplesProcedures autograder."""
 
-    name = "mtsamples_procedures"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mtsamples_procedures",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
helm/benchmark/annotation/mtsamples_replicate_annotator.py

@@ -13,17 +13,17 @@ and aligns with the gold standard response in accuracy, completeness, and clarity.
 
 The patient's information will be provided in these tags:
 <patient_information>
-{{QUESTION}}
+{QUESTION}
 </patient_information>
 
 The proposed treatment plan will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The gold standard treatment plan will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> based on the <patient_information> and compare it to the <gold_response> when needed.

@@ -71,31 +71,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MTSamplesReplicateAnnotator(LLMAsJuryAnnotator):
     """The MTSamplesReplicate autograder."""
 
-    name = "mtsamples_replicate"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mtsamples_replicate",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
helm/benchmark/annotation/starr_patient_instructions_annotator.py

@@ -11,17 +11,17 @@ procedure, and how it compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The case will be provided in these tags:
 <case_details>
-{{QUESTION}}
+{QUESTION}
 </case_details>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> and compare it to the <gold_response> when needed.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class StarrPatientInstructionsAnnotator(LLMAsJuryAnnotator):
    """The StarrPatientInstructions autograder."""
 
-    name = "starr_patient_instructions"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="starr_patient_instructions",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )