crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/adaptation/adapters/in_context_learning_adapter.py CHANGED
@@ -10,7 +10,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS, Reference
 from helm.common.general import parallel_map
 from helm.common.request import Request
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn
 from helm.benchmark.adaptation.adapters.adapter import Adapter


@@ -39,8 +39,8 @@ class InContextLearningAdapter(Adapter, ABC):
 # Pick out training instances
 all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
 if len(all_train_instances) < self.adapter_spec.max_train_instances:
-
-f"
+hwarn(
+f"only {len(all_train_instances)} training instances, "
 f"wanted {self.adapter_spec.max_train_instances}"
 )


helm/benchmark/adaptation/adapters/test_adapter.py CHANGED
@@ -1,9 +1,9 @@
 import shutil
 import tempfile

-
+
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from helm.
+from helm.common.local_context import LocalContext
 from helm.benchmark.window_services.tokenizer_service import TokenizerService


@@ -14,8 +14,8 @@ class TestAdapter:

 def setup_method(self):
 self.path: str = tempfile.mkdtemp()
-
-self.tokenizer_service = TokenizerService(
+context = LocalContext(base_path=self.path, cache_backend_config=BlackHoleCacheBackendConfig())
+self.tokenizer_service = TokenizerService(context)

 def teardown_method(self, _):
 shutil.rmtree(self.path)
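A minimal sketch of the construction pattern this hunk introduces, assuming the constructor signatures shown above are complete; the temporary directory is only illustrative:

import tempfile

from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
from helm.common.local_context import LocalContext
from helm.benchmark.window_services.tokenizer_service import TokenizerService

# The context bundles the base path and cache backend config;
# TokenizerService is now constructed from a context object (per the hunk above).
base_path = tempfile.mkdtemp()  # illustrative scratch directory
context = LocalContext(base_path=base_path, cache_backend_config=BlackHoleCacheBackendConfig())
tokenizer_service = TokenizerService(context)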

helm/benchmark/annotation/air_bench_annotator.py CHANGED
@@ -37,7 +37,7 @@ class AIRBench2024Annotator(Annotator):
 )
 self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
 # Regex pattern is lenient to allow for typos e.g. extra whitespace
-self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
 self._model = model or self._DEFAULT_MODEL
 self._model_deployment = model_deployment or self._DEFAULT_MODEL_DEPLOYMENT

@@ -47,7 +47,7 @@ class AIRBench2024Annotator(Annotator):
 model_input_text = request_state.request.prompt
 model_output_text = request_state.result.completions[0].text
 if not model_output_text.strip():
-return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score":
+return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 1.0}
 category_id = request_state.instance.references[0].output.text
 prompt_template = self._category_id_to_judge_prompt[category_id]
 # Strip to deal with incorrectly formatted input CSV.
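A note on the r-prefix added above (the same change appears in live_qa_annotator.py below): \s is not a recognized Python string escape, so the non-raw form triggers an invalid-escape-sequence warning on recent interpreters even though re receives the same pattern. A quick sanity check that matching behavior is unchanged; the input string here is made up:

import re

# Same pattern as in the hunk above, now as a raw string; the regex itself is unchanged.
pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

match = pattern.search("## short_reasoning: concise and well supported ## the_score: 4.5")
assert match is not None
assert match.group(2).strip() == "4.5"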

helm/benchmark/annotation/bigcodebench_annotator.py CHANGED
@@ -9,7 +9,7 @@ from retrying import retry

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn


 # Based on https://github.com/bigcode-project/bigcodebench/blob/0331489b29cbf2653b4669597ef431e158882aab/bigcodebench/syncheck.py#L14 # noqa: E501
@@ -60,8 +60,8 @@ class BigCodeBenchAnnotator(Annotator):
 hlog(f"BigCodeBenchAnnotator will use the configured endpoint {endpoint}")
 self.client = Client(endpoint, hf_token=api_key)
 else:
-
-f"
+hwarn(
+f"BigCodeBenchAnnotator will use the default public evaluator endpoint {self.DEFAULT_URL} - "
 "set bigcodebenchApiKey and bigcodebenchEndpoint in credentials.conf to use a cloned evaluator instead"
 )
 self.client = Client(self.DEFAULT_URL)

helm/benchmark/annotation/bird_sql_annotator.py CHANGED
@@ -6,7 +6,7 @@ import sqlite3
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.runner import get_benchmark_output_path
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn


 class BirdSQLAnnotator(Annotator):
@@ -34,7 +34,7 @@ class BirdSQLAnnotator(Annotator):
 cursor.execute(ground_truth_sql)
 ground_truth_result = cursor.fetchall()
 except (sqlite3.OperationalError, sqlite3.Warning) as e:
-
+hwarn(f"Ground truth SQL failed with error: {e}")

 assert request_state.result is not None
 assert len(request_state.result.completions) == 1

helm/benchmark/annotation/chw_care_plan_annotator.py CHANGED
@@ -5,9 +5,9 @@ from helm.clients.auto_client import AutoClient


 PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
-generated
-Your goal is to assess how well the generated response captures the necessary information
-
+generated clinical note given unstructured clinical text.
+Your goal is to assess how well the generated response captures the necessary information
+and follows provided instructions in terms of accuracy, structure, and clarity.

 The user's request will be provided in these tags:
 <user_request>
@@ -19,11 +19,6 @@ The response will be provided in these tags:
 {{RESPONSE}}
 </response>

-The reference response will be provided in these tags:
-<gold_response>
-{{GOLD_RESPONSE}}
-</gold_response>
-
 Carefully analyze the <response>.
 For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
 and provide a short justification for your score.
@@ -32,8 +27,8 @@ Evaluation Criteria:
 Accuracy (1-5)
 - Is all the information in the response factually correct?

-
-- Does the response
+Structure (1-5)
+- Does the response contain all parts for the provided note generation structure?

 Clarity (1-5)
 - Is the response easy to understand for a clinician?
@@ -45,7 +40,7 @@ Output the evaluation as a single valid JSON object matching the following structure:
 "score": 0,
 "explanation": "Explain why this score was given."
 },
-"
+"structure": {
 "score": 0,
 "explanation": "Explain why this score was given."
 },
@@ -64,7 +59,7 @@ Ensure the output is valid JSON:

 ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
 "accuracy": {"score", "explanation"},
-"
+"structure": {"score", "explanation"},
 "clarity": {"score", "explanation"},
 }


helm/benchmark/annotation/ehr_sql_annotator.py CHANGED
@@ -4,7 +4,7 @@ import re
 import sqlite3
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.benchmark.runner import get_benchmark_output_path


@@ -32,7 +32,7 @@ class EhrSqlAnnotator(Annotator):
 cursor.execute(ground_truth_sql)
 ground_truth_result = cursor.fetchall()
 except (sqlite3.OperationalError, sqlite3.Warning) as e:
-
+hwarn(f"Ground truth SQL failed with error: {e}")

 # If ground truth SQL execution didn't return results, attempt to use extra_data["value"]
 if not ground_truth_result and request_state.instance.extra_data is not None:

helm/benchmark/annotation/helpdesk_call_summarization_annotator.py CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import Request
 from helm.proxy.retry import NonRetriableException

@@ -107,8 +107,8 @@ Please respond with your output and reasoning in the following format, your reas
 if reasoning_match:
 reasoning = reasoning_match.group(1).strip()
 else:
-
-"
+hwarn(
+"HelpdeskCallSummarizationAnnotator could not get Reasoning from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )

@@ -116,13 +116,13 @@ Please respond with your output and reasoning in the following format, your reas
 try:
 score = float(score_match.group(1).strip())
 except ValueError:
-
-"
+hwarn(
+"HelpdeskCallSummarizationAnnotator could not parse Score from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )
 else:
-
-"
+hwarn(
+"HelpdeskCallSummarizationAnnotator could not get Score from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )


helm/benchmark/annotation/live_qa_annotator.py CHANGED
@@ -50,7 +50,7 @@ class LiveQAAnnotator(Annotator):
 cache_dir = os.path.join(file_storage_path, "data")
 ensure_directory_exists(cache_dir)
 # Regex pattern is lenient to allow for typos e.g. extra whitespace
-self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

 def annotate(self, request_state: RequestState) -> Any:
 assert request_state.result

helm/benchmark/annotation/mimic_bhc_annotator.py ADDED
@@ -0,0 +1,100 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are tasked with evaluating the quality of the generated brief hospital
+course based on the provided clinical note.
+Your goal is to assess how well the brief hospital course captures all the clinical details and
+compares to the gold response in terms of accuracy, completeness, and clarity.
+
+
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+A potential correct response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully analyze the <response>. For each of the following categories,
+rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+and provide a short justification for your score.
+
+Your evaluation should focus on the following criteria:
+Accuracy (1-5)
+- Does the brief hospital course correctly reflect the key details from the clinical note?
+
+Completeness (1-5)
+- Does the brief hospital course include all important details and address the clinical scenario?
+
+Clarity (1-5)
+-Is the brief hospital course easy for clinicians to understand?
+
+
+Output Format:
+Output the evaluation as a single valid JSON object matching the following structure:
+{
+"accuracy": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"completeness": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"clarity": {
+"score": 0,
+"explanation": "Explain why this score was given."
+}
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+"accuracy": {"score", "explanation"},
+"completeness": {"score", "explanation"},
+"clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+"gpt": AnnotatorModelInfo(
+model_name="openai/gpt-4o-2024-05-13",
+model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+),
+"llama": AnnotatorModelInfo(
+model_name="meta/llama-3.3-70b-instruct",
+model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+),
+"claude": AnnotatorModelInfo(
+model_name="anthropic/claude-3-7-sonnet-20250219",
+model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+),
+}
+
+
+class MIMICBHCAnnotator(LLMAsJuryAnnotator):
+"""The MIMICBHC autograder."""
+
+name = "mimic_bhc"
+
+def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+super().__init__(
+auto_client=auto_client,
+prompt_template=PROMPT_TEMPLATE,
+annotation_criteria=ANNOTATION_CRITERIA,
+annotator_models=ANNOTATOR_MODELS,
+)

helm/benchmark/annotation/model_as_judge.py CHANGED
@@ -6,7 +6,7 @@ from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.request import Request


@@ -184,16 +184,13 @@ class LLMAsJuryAnnotator(Annotator):
 """
 for key, value in self._annotation_criteria.items():
 if key not in annotator_criteria:
-
-f"WARNING: Annotator did not find the expected key "
-f"'{key}' in the response from {annotator_name}."
-)
+hwarn(f"Annotator did not find the expected key " f"'{key}' in the response from {annotator_name}.")
 return False

 for subkey in value:
 if subkey not in annotator_criteria[key]:
-
-f"
+hwarn(
+f"Annotator did not find the expected subkey "
 f"'{subkey}' in the response from {annotator_name}."
 )
 return False
@@ -212,7 +209,7 @@ class LLMAsJuryAnnotator(Annotator):
 # Check for empty model output
 model_output_text = request_state.result.completions[0].text
 if not model_output_text.strip():
-
+hwarn("Annotator skipped sending requests because the model response was empty")
 return {
 "prompt_text": None,
 "empty_output_equivalence_judgement": False,
@@ -264,7 +261,7 @@ class LLMAsJuryAnnotator(Annotator):
 annotator_response = self._auto_client.make_request(annotator_request)

 if not annotator_response.success:
-
+hwarn(f"Got an error response from {model_info.model_name}: " f"{annotator_response.error}")
 return None

 try:
@@ -280,17 +277,16 @@ class LLMAsJuryAnnotator(Annotator):
 try:
 annotator_criteria = json.loads(annotator_output)
 except Exception as ex:
-
-f"
+hwarn(
+f"Error parsing response from {model_info.model_name} "
 f"after adding closing brace: {ex}. "
 f"Model output: {annotator_output}"
 )
 return None
 else:
 # For other JSON decoding errors
-
-f"
-f"Model output: {annotator_output}"
+hwarn(
+f"JSON decoding error from {model_info.model_name}: {e}. " f"Model output: {annotator_output}"
 )
 return None

@@ -301,8 +297,8 @@ class LLMAsJuryAnnotator(Annotator):
 return annotator_criteria

 except Exception as e:
-
-f"
+hwarn(
+f"Unexpected error processing response from {model_info.model_name}: {e}. "
 f"Model output: {annotator_output}"
 )
 return None

helm/benchmark/annotation/omni_math_annotator.py CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import Request


@@ -47,9 +47,8 @@ class OmniMATHAnnotator(Annotator):
 .replace("{{Solution}}", model_output_text)
 )
 if not model_output_text.strip():
-
-"
-"because the model response was empty"
+hwarn(
+"OmniMATHAnnotator skipped sending requests to annotator models " "because the model response was empty"
 )
 return {
 "prompt_text": None,
@@ -85,8 +84,8 @@ class OmniMATHAnnotator(Annotator):
 )
 annotator_response = self._auto_client.make_request(annotator_request)
 if not annotator_response.success:
-
-"
+hwarn(
+"OmniMATHAnnotator got an error response from "
 f"{annotator_model_info.model_name}: {annotator_response.error}"
 )
 else:
@@ -96,16 +95,16 @@ class OmniMATHAnnotator(Annotator):
 try:
 student_final_answer = report_parts["Student Final Answer"]
 except KeyError:
-
-"
+hwarn(
+"OmniMATHAnnotator could not get Student Final Answer from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )

 try:
 justification = report_parts["Justification"].strip().removesuffix("=== report over ===").strip()
 except KeyError:
-
-"
+hwarn(
+"OmniMATHAnnotator could not get Justification from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )

@@ -116,13 +115,13 @@ class OmniMATHAnnotator(Annotator):
 elif equivalence_judgement_str == "FALSE":
 equivalence_judgement = False
 else:
-
-"
+hwarn(
+"OmniMATHAnnotator got a non-boolean Equivalence Judgement from annotation from "
 f"{annotator_model_info.model_name}: {equivalence_judgement_str}"
 )
 except KeyError:
-
-"
+hwarn(
+"OmniMATHAnnotator could not get Equivalence Judgement from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )


helm/benchmark/annotation/wildbench_annotator.py CHANGED
@@ -7,7 +7,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import Request


@@ -32,8 +32,8 @@ class WildBenchAnnotator(Annotator):
 model_output_text = request_state.result.completions[0].text
 if not model_output_text.strip():
 # Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
-
-"
+hwarn(
+"WildBenchAnnotator skipped sending requests to annotator models "
 "because the model response was empty"
 )
 return {
@@ -87,8 +87,8 @@ class WildBenchAnnotator(Annotator):
 score: Optional[float] = None
 annotator_response = self._auto_client.make_request(annotator_request)
 if not annotator_response.success:
-
-"
+hwarn(
+"WildBenchAnnotator got an error response from "
 f"{annotator_model_info.model_name}: : {annotator_response.error}"
 )
 else:
@@ -96,8 +96,8 @@ class WildBenchAnnotator(Annotator):
 annotator_response_text = annotator_response.completions[0].text
 annotator_response_parts = self._pattern.search(annotator_response_text)
 if not annotator_response_parts:
-
-"
+hwarn(
+"WildBenchAnnotator got a malformed annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )
 else:
@@ -107,8 +107,8 @@ class WildBenchAnnotator(Annotator):
 try:
 score = float(score_text)
 except ValueError:
-
-"
+hwarn(
+"WildBenchAnnotator could not parse the score from the annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )


helm/benchmark/executor.py CHANGED
@@ -1,19 +1,19 @@
 from typing import Optional
 from dataclasses import dataclass, replace
+
+from helm.common.context import Context
+from helm.common.local_context import LocalContext
+from helm.common.remote_context import RemoteContext
 from helm.common.cache_backend_config import (
 CacheBackendConfig,
 BlackHoleCacheBackendConfig,
 MongoCacheBackendConfig,
 SqliteCacheBackendConfig,
 )
-
 from helm.common.general import parallel_map
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.common.request import RequestResult, GeneratedOutput
 from helm.common.authentication import Authentication
-from helm.proxy.services.remote_service import RemoteService
-from helm.proxy.services.server_service import ServerService
-from helm.proxy.services.service import Service
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState

@@ -29,7 +29,7 @@ class ExecutionSpec:
 """If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""

 auth: Authentication
-"""Authentication that will be passed into the
+"""Authentication that will be passed into the remote service, if using the remote context."""

 local_path: Optional[str]
 """Path where API credentials and cache is stored.
@@ -75,15 +75,14 @@ class Executor:
 else:
 cache_backend_config = BlackHoleCacheBackendConfig()

-self.
+self.context: Context
 if execution_spec.url:
 hlog(f"Running using remote API proxy server: {execution_spec.url}")
-self.
+self.context = RemoteContext(execution_spec.url, execution_spec.auth)
 elif execution_spec.local_path:
 hlog(f"Running in local mode with base path: {execution_spec.local_path}")
-self.
+self.context = LocalContext(
 base_path=execution_spec.local_path,
-root_mode=True,
 cache_backend_config=cache_backend_config,
 )
 else:
@@ -111,12 +110,12 @@

 def process(self, state: RequestState) -> RequestState:
 try:
-result: RequestResult = self.
+result: RequestResult = self.context.make_request(state.request)
 except Exception as e:
 raise ExecutorError(f"{str(e)} Request: {state.request}") from e
 if not result.success:
 if result.error_flags and not result.error_flags.is_fatal:
-
+hwarn(f"Non-fatal error treated as empty completion: {result.error}")
 result.completions = [GeneratedOutput(text="", logprob=0, tokens=[])]
 else:
 raise ExecutorError(f"{str(result.error)} Request: {state.request}")
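Condensed from the hunks above, a sketch of how the executor now selects a context; the helper function is hypothetical (the real logic is inline in Executor.__init__) and the cache backend is simplified to the black-hole config:

from helm.common.authentication import Authentication
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
from helm.common.context import Context
from helm.common.local_context import LocalContext
from helm.common.remote_context import RemoteContext


def make_context(url: str, local_path: str, auth: Authentication) -> Context:
    # Hypothetical helper mirroring Executor.__init__: remote proxy if a URL is given, otherwise local.
    if url:
        return RemoteContext(url, auth)
    return LocalContext(base_path=local_path, cache_backend_config=BlackHoleCacheBackendConfig())


# Downstream code stays context-agnostic; per the hunk above, Executor.process just calls:
#     result: RequestResult = self.context.make_request(state.request)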
helm/benchmark/metrics/aci_bench_metrics.py CHANGED
@@ -1,34 +1,14 @@
-from typing import Any, Dict, List
-
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.
-from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric_service import MetricService
-from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


-class ACIBenchMetric(
+class ACIBenchMetric(LLMJuryMetric):
 """Score metrics for ACIBench."""

-def
-
-
-
-
-
-
-assert request_state.annotations
-annotations: Dict[str, Any] = request_state.annotations["aci_bench"]
-scores: List[int] = []
-score = 0.0
-for annotation_key, annotation_dict in annotations.items():
-if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
-for val in annotation_dict.values():
-scores.append(int(val["score"]))
-if scores:
-score = sum(scores) / len(scores)
-return [
-Stat(MetricName("aci_bench_accuracy")).add(score),
-]
+def __init__(self):
+super().__init__(
+metric_name="aci_bench_accuracy",
+scenario_name="aci_bench",
+annotator_models=ANNOTATOR_MODELS,
+default_score=1.0,
+)
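The MedHELM metric files listed above with identical "+9 -29" counts (med_dialog_metrics.py, medalign_metrics.py, mimic_rrs_metrics.py, and so on) appear to follow the same refactor onto the new LLMJuryMetric base class shown in this hunk. A hypothetical subclass written in that style; the class, metric, and scenario names below are illustrative and not taken from the package:

from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS  # reusing the jury shown above
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


class ExampleJuryMetric(LLMJuryMetric):
    """Hypothetical jury-scored metric following the ACIBenchMetric pattern above."""

    def __init__(self):
        super().__init__(
            metric_name="example_accuracy",
            scenario_name="example_scenario",
            annotator_models=ANNOTATOR_MODELS,
            default_score=1.0,
        )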