crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as potentially problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/adaptation/adapter_spec.py

@@ -63,6 +63,11 @@ class AdapterSpec:
     reference_prefix: str = "A. "
     """The string that is included before each reference (for multiple-choice questions)."""
 
+    # Set hash=False to make `AdapterSpec` hashable
+    reference_prefix_characters: Optional[List[str]] = field(default=None, hash=False)
+    """The characters that are used to identify choices for multiple-choice questions e.g. ["A", "B", "C", "D"].
+    If unset, defaults to the sequence of ascending characters starting from the first character of reference_prefix."""
+
     reference_suffix: str = "\n"
     """The string that is included after each reference (for multiple-choice questions)."""
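
The new `reference_prefix_characters` field lets an `AdapterSpec` enumerate the choice labels explicitly instead of deriving them as consecutive code points from `reference_prefix`. A minimal, hypothetical sketch of how a run spec could set it; the Arabic labels and the specific argument values are illustrative, not taken from this diff:

from helm.benchmark.adaptation.adapter_spec import AdapterSpec

# Hypothetical example: explicit Arabic choice labels, where the default
# chr(ord(first_char) + i) sequence would not yield the conventional letters.
spec = AdapterSpec(
    method="multiple_choice_joint",  # value of ADAPT_MULTIPLE_CHOICE_JOINT
    reference_prefix="أ. ",
    reference_prefix_characters=["أ", "ب", "ج", "د"],
)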

helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py

@@ -18,12 +18,20 @@ class MultipleChoiceJointMultimodalAdapter(InContextLearningMultimodalAdapter, A
     learning for multimodal models.
     """
 
-
-
+    def get_prefix_char(self, prefix: str) -> str:
+        return [char for char in prefix if char.isalnum()][0]
+
+    def get_reference_prefix(self, prefix: str, i: int) -> str:
         """
         Example: prefix = "\nA. ", i = 2, return "\nC. "
         """
-
+        old_prefix_char = self.get_prefix_char(prefix)
+        new_prefix_char = (
+            self.adapter_spec.reference_prefix_characters[i]
+            if self.adapter_spec.reference_prefix_characters
+            else chr(ord(old_prefix_char) + i)
+        )
+        return prefix.replace(old_prefix_char, new_prefix_char)
 
     def generate_requests(
         self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]

helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py

@@ -38,22 +38,25 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
         <input_prefix><input><reference_prefixes[index]><reference><output_prefix><output>
     """
 
-
-    def get_prefix_char(prefix: str) -> str:
+    def get_prefix_char(self, prefix: str) -> str:
         return [char for char in prefix if char.isalnum()][0]
 
-
-    def get_reference_prefix(prefix: str, i: int) -> str:
+    def get_reference_prefix(self, prefix: str, i: int) -> str:
         """
         Example: prefix = "\nA. ", i = 2, return "\nC. "
         """
-
-
+        old_prefix_char = self.get_prefix_char(prefix)
+        new_prefix_char = (
+            self.adapter_spec.reference_prefix_characters[i]
+            if self.adapter_spec.reference_prefix_characters
+            else chr(ord(old_prefix_char) + i)
+        )
+        return prefix.replace(old_prefix_char, new_prefix_char)
 
     def generate_requests(
         self, eval_instance: Instance, train_trial_index: int, training_instances: List[Instance]
     ) -> List[RequestState]:
-        prefix_char =
+        prefix_char = self.get_prefix_char(self.adapter_spec.reference_prefix)
         prompt = self.construct_prompt(training_instances, eval_instance, include_output=False, reference_index=None)
         output_mapping: Dict[str, str] = dict(
             (self.get_reference_prefix(prefix_char, reference_index), reference.output.text)

@@ -91,7 +94,7 @@ class MultipleChoiceJointAdapter(InContextLearningAdapter):
         # Include the references
         delimiter = ", "
         no_correct_references = "n/a"
-        prefix_char =
+        prefix_char = self.get_prefix_char(self.adapter_spec.reference_prefix)
         output = no_correct_references
         for reference_index, reference in enumerate(instance.references):
             prefix = self.get_reference_prefix(self.adapter_spec.reference_prefix, reference_index)
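
Both adapters now share the same label-substitution logic: take the first alphanumeric character of `reference_prefix` and replace it either with the i-th entry of `reference_prefix_characters` (when set) or with the character i code points later. A self-contained sketch that mirrors this behaviour for illustration; it re-implements, rather than imports, the adapter method:

from typing import List, Optional


def reference_prefix_for_index(prefix: str, i: int, prefix_characters: Optional[List[str]] = None) -> str:
    """Illustrative mirror of the adapters' get_reference_prefix logic."""
    old_char = [c for c in prefix if c.isalnum()][0]
    new_char = prefix_characters[i] if prefix_characters else chr(ord(old_char) + i)
    return prefix.replace(old_char, new_char)


# Default behaviour: consecutive code points starting from the prefix character.
assert reference_prefix_for_index("\nA. ", 2) == "\nC. "
# With explicit labels configured, the i-th configured character is used instead.
assert reference_prefix_for_index("\nA. ", 2, ["A", "B", "Z"]) == "\nZ. "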

helm/benchmark/annotation/aci_bench_annotator.py

@@ -10,15 +10,15 @@ Your goal is to assess how well the note captures the clinical information from
 compare it to the reference note (gold standard) in terms of accruacy, completeness and clarity.
 The conversation will be provided in these tags:
 <conversation>
-{
+{QUESTION}
 </conversation>
 The generated note will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 The reference note will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 Carefully review the <response> based on the <conversation> and compare it to the <gold_response> when needed.
 

@@ -65,31 +65,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class ACIBenchAnnotator(LLMAsJuryAnnotator):
     """The ACIBench autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="aci_bench",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )
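
With the per-module ANNOTATOR_MODELS table deleted (here and in the other MedHELM annotators below), the jury composition is now injected through the new `annotator_models` constructor argument; judging from the file list, the MedHELM run specs appear to source it from the new helm/benchmark/scenarios/medhelm/judges.yaml. A hedged sketch of constructing the annotator directly; the import path for AnnotatorModelInfo and the single-judge dictionary are assumptions for illustration:

from typing import Dict

from helm.benchmark.annotation.aci_bench_annotator import ACIBenchAnnotator
from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
from helm.clients.auto_client import AutoClient


def build_aci_bench_annotator(auto_client: AutoClient) -> ACIBenchAnnotator:
    # Example jury of one; callers can pass any mapping of judge name -> model info.
    annotator_models: Dict[str, AnnotatorModelInfo] = {
        "gpt": AnnotatorModelInfo(
            model_name="openai/gpt-4o-2024-05-13",
            model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
        ),
    }
    return ACIBenchAnnotator(auto_client=auto_client, annotator_models=annotator_models)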

helm/benchmark/annotation/alrage_annotator.py (new file)

@@ -0,0 +1,90 @@
+from typing import Any
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.hierarchical_logger import hwarn
+from helm.common.request import Request
+
+
+class ALRAGEAnnotator(Annotator):
+    """The ALRAGEAnnotator autograder."""
+
+    name = "alrage"
+
+    _ANNOTATOR_MODEL = "openai/gpt-4o-2024-11-20"
+
+    def __init__(self, auto_client: AutoClient):
+        self._auto_client = auto_client
+
+    def _apply_annotator_template(self, question: str, answer: str, gold: str) -> Any:
+        return [
+            {
+                "role": "system",
+                "content": """أنت مقيّم محايد خبير باللغة العربية. يجب عليك:
+1. تقييم دقة الإجابة مقارنة بالإجابة الصحيحة
+2. التحقق من أن الإجابة مدعومة بالسياق المقدم
+3. تقييم جودة وشمولية الإجابة
+
+مهم جداً: يجب أن يكون ردك رقماً فقط من 0 إلى 10. لا تضف أي نص أو تفسير.""",
+            },
+            {
+                "role": "user",
+                "content": f"""السؤال: {question}
+
+الإجابة المقدمة: {answer}
+
+الإجابة الصحيحة: {gold}
+
+أعط تقييماً من 0 إلى 10:
+0-2: إجابة خاطئة تماماً
+3-4: إجابة جزئية مع أخطاء
+5-6: إجابة متوسطة
+7-8: إجابة جيدة
+9-10: إجابة ممتازة
+
+اكتب رقماً فقط من 0 إلى 10 بدون أي نص إضافي:""",
+            },
+        ]
+
+    def _parse_annotator_response(self, response: str) -> float:
+        """Process the judge's response to extract the score"""
+        try:
+            # Extract the first number from the response content
+            score = float(next(num for num in response.split() if num.replace(".", "", 1).isdigit()))
+            return min(max(score / 10.0, 0.0), 1.0)
+
+        except Exception as e:
+            hwarn(f"Error while processing judge response: {e}")
+            return 0.0
+
+    def annotate(self, request_state: RequestState) -> Any:
+        question = request_state.instance.input.text
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        answer = request_state.result.completions[0].text
+        assert len(request_state.instance.all_correct_references) == 1
+        gold = request_state.instance.all_correct_references[0].output.text
+        messages = self._apply_annotator_template(question, answer, gold)
+        judge_request = Request(
+            model=self._ANNOTATOR_MODEL,
+            model_deployment=self._ANNOTATOR_MODEL,
+            messages=messages,
+            temperature=0.0,
+            max_tokens=2000,
+        )
+        judge_response = self._auto_client.make_request(judge_request)
+        if not judge_response.success:
+            raise Exception(
+                "ALRAGEAnnotator got an error response from " f"{self._ANNOTATOR_MODEL}: {judge_response.error}"
+            )
+        assert len(judge_response.completions) == 1
+        prompt = messages[-1]["content"]
+        response = judge_response.completions[0].text
+        score = self._parse_annotator_response(response)
+
+        return {
+            "prompt": prompt,
+            "response": response,
+            "score": score,
+        }
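
The judge is asked to reply with a bare score from 0 to 10, which `_parse_annotator_response` rescales to the [0, 1] range, clamps, and defaults to 0.0 when no number can be found. A standalone mirror of that parsing, for illustration only:

def parse_judge_score(response: str) -> float:
    """Illustrative mirror of ALRAGEAnnotator._parse_annotator_response."""
    try:
        # First whitespace-separated token that looks like a number, scaled from 0-10 to 0-1.
        score = float(next(tok for tok in response.split() if tok.replace(".", "", 1).isdigit()))
        return min(max(score / 10.0, 0.0), 1.0)
    except Exception:
        return 0.0


assert parse_judge_score("7") == 0.7
assert parse_judge_score("التقييم: 9") == 0.9   # tolerates a stray prefix before the number
assert parse_judge_score("no number here") == 0.0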

helm/benchmark/annotation/chw_care_plan_annotator.py

@@ -11,12 +11,12 @@ and follows provided instructions in terms of accuracy, structure, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 Carefully analyze the <response>.

@@ -63,31 +63,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class CHWCarePlanAnnotator(LLMAsJuryAnnotator):
     """The CHWCarePlan autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="chw_care_plan",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
        )

helm/benchmark/annotation/dischargeme_annotator.py

@@ -15,19 +15,19 @@ gold response in terms of accuracy, completeness, and clarity.
 The target task of either generating a discharge instruction or brief hospital course along with
 the patient discharge text and radiology report will be provided in these tags:
 <patient_information>
-{
+{QUESTION}
 </patient_information>
 
 
 The document will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 The gold standard target document (either discharge instructions or a brief hospital course)
 will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response> based on the <patient_information> and compare

@@ -77,31 +77,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class DischargeMeAnnotator(LLMAsJuryAnnotator):
     """The DischargeMe autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="dischargeme",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )

helm/benchmark/annotation/med_dialog_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The patient-doctor conversation will be provided in these tags:
 <conversation>
-{
+{QUESTION}
 </conversation>
 
 The response will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> and compare it to the <gold_response> when needed.

@@ -69,31 +69,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MedDialogAnnotator(LLMAsJuryAnnotator):
     """The MedDialog autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="med_dialog",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )

helm/benchmark/annotation/medalign_annotator.py

@@ -12,17 +12,17 @@ and aligns with the gold response in terms of accuracy, completeness, and clarit
 
 The instruction and EHR pair will be provided in these tags:
 <user_request>
-{
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 The gold response (reference answer) will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> based on the <user_request> and compare it to

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MedalignAnnotator(LLMAsJuryAnnotator):
     """The Medalign autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="medalign",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )

helm/benchmark/annotation/medi_qa_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The question will be provided in these tags:
 <question>
-{
+{QUESTION}
 </question>
 
 The response will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 The reference answer will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response> compared to the <gold_response> and the original <question>.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MediQAAnnotator(LLMAsJuryAnnotator):
     """The MediQA autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="medi_qa",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )

helm/benchmark/annotation/medication_qa_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The question provided in these tags:
 <medication_question>
-{
+{QUESTION}
 </medication_question>
 
 The response will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> and compare it to the <gold_response> when needed.

@@ -67,31 +67,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MedicationQAAnnotator(LLMAsJuryAnnotator):
     """The MedicationQA autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="medication_qa",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )

helm/benchmark/annotation/mental_health_annotator.py

@@ -11,17 +11,17 @@ and how it compares to the gold response in terms of accuracy, completeness, and
 
 The conversation history and question will be provided in these tags:
 <conversation>
-{
+{QUESTION}
 </conversation>
 
 The response will be provided in these tags:
 <response>
-{
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response> compared to the <gold_response> and the original <conversation>.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MentalHealthAnnotator(LLMAsJuryAnnotator):
     """The MentalHealth autograder."""
 
-
-
-
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mental_health",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=
+            annotator_models=annotator_models,
         )