crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Potentially problematic release.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/annotation/mimic_bhc_annotator.py

@@ -12,17 +12,17 @@ compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 A potential correct response will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response>. For each of the following categories,

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MIMICBHCAnnotator(LLMAsJuryAnnotator):
     """The MIMICBHC autograder."""
 
-    name = "mimic_bhc"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mimic_bhc",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
         )
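The same pattern repeats across the annotator files below: each one deletes its hard-coded ANNOTATOR_MODELS jury and instead takes the jury as a constructor argument. As a minimal sketch of the new construction path, the helper below recreates the 0.5.7 ensemble explicitly; the import paths are assumptions inferred from this diff's file layout, and build_mimic_bhc_annotator is a hypothetical helper, not part of the package.

    from typing import Dict

    # Import paths assumed from the file layout shown in this diff.
    from helm.benchmark.annotation.mimic_bhc_annotator import MIMICBHCAnnotator
    from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
    from helm.clients.auto_client import AutoClient


    def build_mimic_bhc_annotator(auto_client: AutoClient) -> MIMICBHCAnnotator:
        """Hypothetical helper: recreate the judge jury that 0.5.7 hard-coded."""
        annotator_models: Dict[str, AnnotatorModelInfo] = {
            "gpt": AnnotatorModelInfo(
                model_name="openai/gpt-4o-2024-05-13",
                model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
            ),
            "llama": AnnotatorModelInfo(
                model_name="meta/llama-3.3-70b-instruct",
                model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
            ),
            "claude": AnnotatorModelInfo(
                model_name="anthropic/claude-3-7-sonnet-20250219",
                model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
            ),
        }
        # The jury is now injected rather than read from a module-level constant,
        # which is presumably what the new scenarios/medhelm/judges.yaml configures.
        return MIMICBHCAnnotator(auto_client=auto_client, annotator_models=annotator_models)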
helm/benchmark/annotation/mimic_rrs_annotator.py

@@ -11,17 +11,17 @@ how it compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 Some potential correct responses will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response>.

@@ -70,31 +70,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MIMICRRSAnnotator(LLMAsJuryAnnotator):
     """The MIMICRRS autograder."""
 
-    name = "mimic_rrs"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mimic_rrs",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
helm/benchmark/annotation/model_as_judge.py

@@ -1,6 +1,7 @@
 import json
 import re
 from dataclasses import dataclass
+from string import Template
 from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set
 
 from helm.benchmark.adaptation.request_state import RequestState

@@ -113,6 +114,7 @@ class LLMAsJuryAnnotator(Annotator):
 
     def __init__(
         self,
+        name: str,
         auto_client: AutoClient,
         prompt_template: str,
         annotation_criteria: Dict[str, Set[str]],

@@ -128,6 +130,7 @@ class LLMAsJuryAnnotator(Annotator):
         :param annotator_models: Dictionary of models to use for annotation
         :param preprocessor: Optional function to preprocess model responses
         """
+        self.name = name
         self._auto_client = auto_client
         self._prompt_template = prompt_template
         self._annotation_criteria = annotation_criteria

@@ -147,32 +150,34 @@ class LLMAsJuryAnnotator(Annotator):
     def _interpolate_prompt(
         self, request_state: RequestState, custom_replacements: Optional[Dict[str, str]] = None
     ) -> str:
-        """
-
-
-
-
-        :return: Interpolated prompt
-        """
-        base_replacements = {
-            "{{QUESTION}}": request_state.instance.input.text,
-            "{{RESPONSE}}": (
+        """Interpolate prompt templates safely, supporting {{QUESTION}}-style files."""
+        # Build required/optional fields
+        replacements: Dict[str, str] = {
+            "QUESTION": request_state.instance.input.text,
+            "RESPONSE": (
                 request_state.result.completions[0].text
                 if request_state.result and request_state.result.completions
                 else ""
             ),
-            "{{GOLD_RESPONSE}}": request_state.instance.references[0].output.text,
+            # GOLD is optional; keep empty if not present
+            "GOLD_RESPONSE": (
+                request_state.instance.references[0].output.text
+                if getattr(request_state.instance, "references", None)
+                else ""
+            ),
         }
-
-        # Allow custom replacements to override base replacements
         if custom_replacements:
-            base_replacements.update(custom_replacements)
+            replacements.update(custom_replacements)
 
-        prompt = self._prompt_template
-        for placeholder, value in base_replacements.items():
-            prompt = prompt.replace(placeholder, value)
+        tmpl_text = self._prompt_template
+
+        tmpl_text = (
+            tmpl_text.replace("{QUESTION}", "$QUESTION")
+            .replace("{RESPONSE}", "$RESPONSE")
+            .replace("{GOLD_RESPONSE}", "$GOLD_RESPONSE")
+        )
 
-        return prompt
+        return Template(tmpl_text).substitute(replacements)
 
     def _validate_annotation(self, annotator_criteria: Dict[str, Any], annotator_name: str) -> bool:
         """
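The _interpolate_prompt rewrite is the core of this file's change: single-brace placeholders are first rewritten to $-identifiers and then resolved with string.Template, so literal braces elsewhere in a prompt (for example, JSON examples in the grading instructions) can no longer collide with the substitution step. A self-contained sketch of the mechanism, stdlib only, using a made-up template rather than one of the shipped prompts:

    from string import Template

    # Made-up prompt in the new single-brace style. The literal JSON braces in
    # the first line would break str.format()-style interpolation, but
    # string.Template only reacts to "$", so they pass through untouched.
    prompt = (
        'Respond with JSON like {"score": 1, "explanation": "..."}.\n'
        "<user_request>\n{QUESTION}\n</user_request>"
    )

    # The same rewrite the new _interpolate_prompt performs: only the known
    # placeholders become Template identifiers.
    tmpl_text = (
        prompt.replace("{QUESTION}", "$QUESTION")
        .replace("{RESPONSE}", "$RESPONSE")
        .replace("{GOLD_RESPONSE}", "$GOLD_RESPONSE")
    )

    print(Template(tmpl_text).substitute(QUESTION="Summarize this note."))

One residual caveat: Template treats a literal "$" as its own escape character, so a prompt containing a dollar sign would still need "$$"-escaping before substitute() is called.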
helm/benchmark/annotation/mtsamples_procedures_annotator.py

@@ -11,17 +11,17 @@ and compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The user's request will be provided in these tags:
 <user_request>
-{{QUESTION}}
+{QUESTION}
 </user_request>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 Some potential correct responses will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully analyze the <response>.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MTSamplesProceduresAnnotator(LLMAsJuryAnnotator):
     """The MTSamplesProcedures autograder."""
 
-    name = "mtsamples_procedures"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mtsamples_procedures",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
helm/benchmark/annotation/mtsamples_replicate_annotator.py

@@ -13,17 +13,17 @@ and aligns with the gold standard response in accuracy, completeness, and clarity.
 
 The patient's information will be provided in these tags:
 <patient_information>
-{{QUESTION}}
+{QUESTION}
 </patient_information>
 
 The proposed treatment plan will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The gold standard treatment plan will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> based on the <patient_information> and compare it to the <gold_response> when needed.

@@ -71,31 +71,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class MTSamplesReplicateAnnotator(LLMAsJuryAnnotator):
     """The MTSamplesReplicate autograder."""
 
-    name = "mtsamples_replicate"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="mtsamples_replicate",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )
helm/benchmark/annotation/starr_patient_instructions_annotator.py

@@ -11,17 +11,17 @@ procedure, and how it compares to the gold response in terms of accuracy, completeness, and clarity.
 
 The case will be provided in these tags:
 <case_details>
-{{QUESTION}}
+{QUESTION}
 </case_details>
 
 The response will be provided in these tags:
 <response>
-{{RESPONSE}}
+{RESPONSE}
 </response>
 
 The reference response will be provided in these tags:
 <gold_response>
-{{GOLD_RESPONSE}}
+{GOLD_RESPONSE}
 </gold_response>
 
 Carefully review the <response> and compare it to the <gold_response> when needed.

@@ -68,31 +68,20 @@ ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
     "clarity": {"score", "explanation"},
 }
 
-ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
-    "gpt": AnnotatorModelInfo(
-        model_name="openai/gpt-4o-2024-05-13",
-        model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
-    ),
-    "llama": AnnotatorModelInfo(
-        model_name="meta/llama-3.3-70b-instruct",
-        model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
-    ),
-    "claude": AnnotatorModelInfo(
-        model_name="anthropic/claude-3-7-sonnet-20250219",
-        model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
-    ),
-}
-
 
 class StarrPatientInstructionsAnnotator(LLMAsJuryAnnotator):
    """The StarrPatientInstructions autograder."""
 
-    name = "starr_patient_instructions"
-
-    def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+    def __init__(
+        self,
+        auto_client: AutoClient,
+        annotator_models: Dict[str, AnnotatorModelInfo],
+        template_name: Optional[str] = None,
+    ):
         super().__init__(
+            name="starr_patient_instructions",
             auto_client=auto_client,
             prompt_template=PROMPT_TEMPLATE,
             annotation_criteria=ANNOTATION_CRITERIA,
-            annotator_models=ANNOTATOR_MODELS,
+            annotator_models=annotator_models,
        )