crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0

helm/benchmark/adaptation/adapters/in_context_learning_adapter.py CHANGED
@@ -10,7 +10,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, EVAL_SPLITS, Reference
 from helm.common.general import parallel_map
 from helm.common.request import Request
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn
 from helm.benchmark.adaptation.adapters.adapter import Adapter


@@ -39,8 +39,8 @@ class InContextLearningAdapter(Adapter, ABC):
 # Pick out training instances
 all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
 if len(all_train_instances) < self.adapter_spec.max_train_instances:
-
-f"
+hwarn(
+f"only {len(all_train_instances)} training instances, "
 f"wanted {self.adapter_spec.max_train_instances}"
 )


helm/benchmark/adaptation/adapters/test_adapter.py CHANGED
@@ -1,9 +1,9 @@
 import shutil
 import tempfile

-
+
 from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
-from helm.
+from helm.common.local_context import LocalContext
 from helm.benchmark.window_services.tokenizer_service import TokenizerService


@@ -14,8 +14,8 @@ class TestAdapter:

 def setup_method(self):
 self.path: str = tempfile.mkdtemp()
-
-self.tokenizer_service = TokenizerService(
+context = LocalContext(base_path=self.path, cache_backend_config=BlackHoleCacheBackendConfig())
+self.tokenizer_service = TokenizerService(context)

 def teardown_method(self, _):
 shutil.rmtree(self.path)
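A minimal sketch of the construction pattern this hunk introduces, assuming the constructor signatures shown above are complete; the temporary directory is only illustrative:

import tempfile

from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
from helm.common.local_context import LocalContext
from helm.benchmark.window_services.tokenizer_service import TokenizerService

# The context bundles the base path and cache backend config;
# TokenizerService is now constructed from a context object (per the hunk above).
base_path = tempfile.mkdtemp()  # illustrative scratch directory
context = LocalContext(base_path=base_path, cache_backend_config=BlackHoleCacheBackendConfig())
tokenizer_service = TokenizerService(context)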

helm/benchmark/annotation/air_bench_annotator.py CHANGED
@@ -37,7 +37,7 @@ class AIRBench2024Annotator(Annotator):
 )
 self._category_id_to_judge_prompt = {row["cate-idx"]: row["judge_prompt"] for row in dataset}
 # Regex pattern is lenient to allow for typos e.g. extra whitespace
-self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
 self._model = model or self._DEFAULT_MODEL
 self._model_deployment = model_deployment or self._DEFAULT_MODEL_DEPLOYMENT

@@ -47,7 +47,7 @@ class AIRBench2024Annotator(Annotator):
 model_input_text = request_state.request.prompt
 model_output_text = request_state.result.completions[0].text
 if not model_output_text.strip():
-return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score":
+return {"prompt_text": "", "reasoning": "BLOCKED_REQUEST_OR_EMPTY_RESPONSE", "score": 1.0}
 category_id = request_state.instance.references[0].output.text
 prompt_template = self._category_id_to_judge_prompt[category_id]
 # Strip to deal with incorrectly formatted input CSV.
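A note on the r-prefix added above (the same change appears in live_qa_annotator.py below): \s is not a recognized Python string escape, so the non-raw form triggers an invalid-escape-sequence warning on recent interpreters even though re receives the same pattern. A quick sanity check that matching behavior is unchanged; the input string here is made up:

import re

# Same pattern as in the hunk above, now as a raw string; the regex itself is unchanged.
pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

match = pattern.search("## short_reasoning: concise and well supported ## the_score: 4.5")
assert match is not None
assert match.group(2).strip() == "4.5"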

helm/benchmark/annotation/bigcodebench_annotator.py CHANGED
@@ -9,7 +9,7 @@ from retrying import retry

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn


 # Based on https://github.com/bigcode-project/bigcodebench/blob/0331489b29cbf2653b4669597ef431e158882aab/bigcodebench/syncheck.py#L14 # noqa: E501
@@ -60,8 +60,8 @@ class BigCodeBenchAnnotator(Annotator):
 hlog(f"BigCodeBenchAnnotator will use the configured endpoint {endpoint}")
 self.client = Client(endpoint, hf_token=api_key)
 else:
-
-f"
+hwarn(
+f"BigCodeBenchAnnotator will use the default public evaluator endpoint {self.DEFAULT_URL} - "
 "set bigcodebenchApiKey and bigcodebenchEndpoint in credentials.conf to use a cloned evaluator instead"
 )
 self.client = Client(self.DEFAULT_URL)

helm/benchmark/annotation/bird_sql_annotator.py CHANGED
@@ -6,7 +6,7 @@ import sqlite3
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.runner import get_benchmark_output_path
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn


 class BirdSQLAnnotator(Annotator):
@@ -34,7 +34,7 @@ class BirdSQLAnnotator(Annotator):
 cursor.execute(ground_truth_sql)
 ground_truth_result = cursor.fetchall()
 except (sqlite3.OperationalError, sqlite3.Warning) as e:
-
+hwarn(f"Ground truth SQL failed with error: {e}")

 assert request_state.result is not None
 assert len(request_state.result.completions) == 1

helm/benchmark/annotation/chw_care_plan_annotator.py CHANGED
@@ -5,9 +5,9 @@ from helm.clients.auto_client import AutoClient


 PROMPT_TEMPLATE = """You are a medical expert tasked with evaluating the quality of a
-generated
-Your goal is to assess how well the generated response captures the necessary information
-
+generated clinical note given unstructured clinical text.
+Your goal is to assess how well the generated response captures the necessary information
+and follows provided instructions in terms of accuracy, structure, and clarity.

 The user's request will be provided in these tags:
 <user_request>
@@ -19,11 +19,6 @@ The response will be provided in these tags:
 {{RESPONSE}}
 </response>

-The reference response will be provided in these tags:
-<gold_response>
-{{GOLD_RESPONSE}}
-</gold_response>
-
 Carefully analyze the <response>.
 For each of the following categories, rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
 and provide a short justification for your score.
@@ -32,8 +27,8 @@ Evaluation Criteria:
 Accuracy (1-5)
 - Is all the information in the response factually correct?

-
-- Does the response
+Structure (1-5)
+- Does the response contain all parts for the provided note generation structure?

 Clarity (1-5)
 - Is the response easy to understand for a clinician?
@@ -45,7 +40,7 @@ Output the evaluation as a single valid JSON object matching the following structure:
 "score": 0,
 "explanation": "Explain why this score was given."
 },
-"
+"structure": {
 "score": 0,
 "explanation": "Explain why this score was given."
 },
@@ -64,7 +59,7 @@ Ensure the output is valid JSON:

 ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
 "accuracy": {"score", "explanation"},
-"
+"structure": {"score", "explanation"},
 "clarity": {"score", "explanation"},
 }


helm/benchmark/annotation/ehr_sql_annotator.py CHANGED
@@ -4,7 +4,7 @@ import re
 import sqlite3
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.benchmark.runner import get_benchmark_output_path


@@ -32,7 +32,7 @@ class EhrSqlAnnotator(Annotator):
 cursor.execute(ground_truth_sql)
 ground_truth_result = cursor.fetchall()
 except (sqlite3.OperationalError, sqlite3.Warning) as e:
-
+hwarn(f"Ground truth SQL failed with error: {e}")

 # If ground truth SQL execution didn't return results, attempt to use extra_data["value"]
 if not ground_truth_result and request_state.instance.extra_data is not None:

helm/benchmark/annotation/helpdesk_call_summarization_annotator.py CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import Request
 from helm.proxy.retry import NonRetriableException

@@ -107,8 +107,8 @@ Please respond with your output and reasoning in the following format, your reas
 if reasoning_match:
 reasoning = reasoning_match.group(1).strip()
 else:
-
-"
+hwarn(
+"HelpdeskCallSummarizationAnnotator could not get Reasoning from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )

@@ -116,13 +116,13 @@ Please respond with your output and reasoning in the following format, your reas
 try:
 score = float(score_match.group(1).strip())
 except ValueError:
-
-"
+hwarn(
+"HelpdeskCallSummarizationAnnotator could not parse Score from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )
 else:
-
-"
+hwarn(
+"HelpdeskCallSummarizationAnnotator could not get Score from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )


helm/benchmark/annotation/live_qa_annotator.py CHANGED
@@ -50,7 +50,7 @@ class LiveQAAnnotator(Annotator):
 cache_dir = os.path.join(file_storage_path, "data")
 ensure_directory_exists(cache_dir)
 # Regex pattern is lenient to allow for typos e.g. extra whitespace
-self._pattern = re.compile("##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)
+self._pattern = re.compile(r"##\s*short_reasoning\s*:(.*)##\s*the_score\s*:(.*)", re.DOTALL)

 def annotate(self, request_state: RequestState) -> Any:
 assert request_state.result

helm/benchmark/annotation/mimic_bhc_annotator.py ADDED
@@ -0,0 +1,100 @@
+from typing import Dict, Optional, Set
+
+from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo, LLMAsJuryAnnotator
+from helm.clients.auto_client import AutoClient
+
+
+PROMPT_TEMPLATE = """You are tasked with evaluating the quality of the generated brief hospital
+course based on the provided clinical note.
+Your goal is to assess how well the brief hospital course captures all the clinical details and
+compares to the gold response in terms of accuracy, completeness, and clarity.
+
+
+The user's request will be provided in these tags:
+<user_request>
+{{QUESTION}}
+</user_request>
+
+The response will be provided in these tags:
+<response>
+{{RESPONSE}}
+</response>
+
+A potential correct response will be provided in these tags:
+<gold_response>
+{{GOLD_RESPONSE}}
+</gold_response>
+
+Carefully analyze the <response>. For each of the following categories,
+rate the response on a scale of 1 to 5 (1 = very poor, 5 = excellent),
+and provide a short justification for your score.
+
+Your evaluation should focus on the following criteria:
+Accuracy (1-5)
+- Does the brief hospital course correctly reflect the key details from the clinical note?
+
+Completeness (1-5)
+- Does the brief hospital course include all important details and address the clinical scenario?
+
+Clarity (1-5)
+-Is the brief hospital course easy for clinicians to understand?
+
+
+Output Format:
+Output the evaluation as a single valid JSON object matching the following structure:
+{
+"accuracy": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"completeness": {
+"score": 0,
+"explanation": "Explain why this score was given."
+},
+"clarity": {
+"score": 0,
+"explanation": "Explain why this score was given."
+}
+}
+
+Ensure the output is valid JSON:
+- Use **double quotes** (") for all keys and string values.
+- When quoting text or sections inside the explanations, use escaped double quotes (\") to
+maintain valid JSON formatting.
+- Do not include any additional information in the output.
+"""
+
+ANNOTATION_CRITERIA: Dict[str, Set[str]] = {
+"accuracy": {"score", "explanation"},
+"completeness": {"score", "explanation"},
+"clarity": {"score", "explanation"},
+}
+
+ANNOTATOR_MODELS: Dict[str, AnnotatorModelInfo] = {
+"gpt": AnnotatorModelInfo(
+model_name="openai/gpt-4o-2024-05-13",
+model_deployment="stanfordhealthcare/gpt-4o-2024-05-13",
+),
+"llama": AnnotatorModelInfo(
+model_name="meta/llama-3.3-70b-instruct",
+model_deployment="stanfordhealthcare/llama-3.3-70b-instruct",
+),
+"claude": AnnotatorModelInfo(
+model_name="anthropic/claude-3-7-sonnet-20250219",
+model_deployment="stanfordhealthcare/claude-3-7-sonnet-20250219",
+),
+}
+
+
+class MIMICBHCAnnotator(LLMAsJuryAnnotator):
+"""The MIMICBHC autograder."""
+
+name = "mimic_bhc"
+
+def __init__(self, auto_client: AutoClient, template_name: Optional[str] = None):
+super().__init__(
+auto_client=auto_client,
+prompt_template=PROMPT_TEMPLATE,
+annotation_criteria=ANNOTATION_CRITERIA,
+annotator_models=ANNOTATOR_MODELS,
+)

helm/benchmark/annotation/model_as_judge.py CHANGED
@@ -6,7 +6,7 @@ from typing import Dict, Optional, TypedDict, Union, Callable, Any, Set
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.request import Request


@@ -184,16 +184,13 @@ class LLMAsJuryAnnotator(Annotator):
 """
 for key, value in self._annotation_criteria.items():
 if key not in annotator_criteria:
-
-f"WARNING: Annotator did not find the expected key "
-f"'{key}' in the response from {annotator_name}."
-)
+hwarn(f"Annotator did not find the expected key " f"'{key}' in the response from {annotator_name}.")
 return False

 for subkey in value:
 if subkey not in annotator_criteria[key]:
-
-f"
+hwarn(
+f"Annotator did not find the expected subkey "
 f"'{subkey}' in the response from {annotator_name}."
 )
 return False
@@ -212,7 +209,7 @@ class LLMAsJuryAnnotator(Annotator):
 # Check for empty model output
 model_output_text = request_state.result.completions[0].text
 if not model_output_text.strip():
-
+hwarn("Annotator skipped sending requests because the model response was empty")
 return {
 "prompt_text": None,
 "empty_output_equivalence_judgement": False,
@@ -264,7 +261,7 @@ class LLMAsJuryAnnotator(Annotator):
 annotator_response = self._auto_client.make_request(annotator_request)

 if not annotator_response.success:
-
+hwarn(f"Got an error response from {model_info.model_name}: " f"{annotator_response.error}")
 return None

 try:
@@ -280,17 +277,16 @@ class LLMAsJuryAnnotator(Annotator):
 try:
 annotator_criteria = json.loads(annotator_output)
 except Exception as ex:
-
-f"
+hwarn(
+f"Error parsing response from {model_info.model_name} "
 f"after adding closing brace: {ex}. "
 f"Model output: {annotator_output}"
 )
 return None
 else:
 # For other JSON decoding errors
-
-f"
-f"Model output: {annotator_output}"
+hwarn(
+f"JSON decoding error from {model_info.model_name}: {e}. " f"Model output: {annotator_output}"
 )
 return None

@@ -301,8 +297,8 @@ class LLMAsJuryAnnotator(Annotator):
 return annotator_criteria

 except Exception as e:
-
-f"
+hwarn(
+f"Unexpected error processing response from {model_info.model_name}: {e}. "
 f"Model output: {annotator_output}"
 )
 return None

helm/benchmark/annotation/omni_math_annotator.py CHANGED
@@ -5,7 +5,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import Request


@@ -47,9 +47,8 @@ class OmniMATHAnnotator(Annotator):
 .replace("{{Solution}}", model_output_text)
 )
 if not model_output_text.strip():
-
-"
-"because the model response was empty"
+hwarn(
+"OmniMATHAnnotator skipped sending requests to annotator models " "because the model response was empty"
 )
 return {
 "prompt_text": None,
@@ -85,8 +84,8 @@ class OmniMATHAnnotator(Annotator):
 )
 annotator_response = self._auto_client.make_request(annotator_request)
 if not annotator_response.success:
-
-"
+hwarn(
+"OmniMATHAnnotator got an error response from "
 f"{annotator_model_info.model_name}: {annotator_response.error}"
 )
 else:
@@ -96,16 +95,16 @@ class OmniMATHAnnotator(Annotator):
 try:
 student_final_answer = report_parts["Student Final Answer"]
 except KeyError:
-
-"
+hwarn(
+"OmniMATHAnnotator could not get Student Final Answer from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )

 try:
 justification = report_parts["Justification"].strip().removesuffix("=== report over ===").strip()
 except KeyError:
-
-"
+hwarn(
+"OmniMATHAnnotator could not get Justification from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )

@@ -116,13 +115,13 @@ class OmniMATHAnnotator(Annotator):
 elif equivalence_judgement_str == "FALSE":
 equivalence_judgement = False
 else:
-
-"
+hwarn(
+"OmniMATHAnnotator got a non-boolean Equivalence Judgement from annotation from "
 f"{annotator_model_info.model_name}: {equivalence_judgement_str}"
 )
 except KeyError:
-
-"
+hwarn(
+"OmniMATHAnnotator could not get Equivalence Judgement from annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )


helm/benchmark/annotation/wildbench_annotator.py CHANGED
@@ -7,7 +7,7 @@ from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.annotator import Annotator
 from helm.benchmark.annotation.model_as_judge import AnnotatorModelInfo
 from helm.clients.auto_client import AutoClient
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 from helm.common.request import Request


@@ -32,8 +32,8 @@ class WildBenchAnnotator(Annotator):
 model_output_text = request_state.result.completions[0].text
 if not model_output_text.strip():
 # Following https://github.com/allenai/WildBench/blob/d6b8dcaf377d173d031980f97c16e1a82618c03d/src/eval.py
-
-"
+hwarn(
+"WildBenchAnnotator skipped sending requests to annotator models "
 "because the model response was empty"
 )
 return {
@@ -87,8 +87,8 @@ class WildBenchAnnotator(Annotator):
 score: Optional[float] = None
 annotator_response = self._auto_client.make_request(annotator_request)
 if not annotator_response.success:
-
-"
+hwarn(
+"WildBenchAnnotator got an error response from "
 f"{annotator_model_info.model_name}: : {annotator_response.error}"
 )
 else:
@@ -96,8 +96,8 @@ class WildBenchAnnotator(Annotator):
 annotator_response_text = annotator_response.completions[0].text
 annotator_response_parts = self._pattern.search(annotator_response_text)
 if not annotator_response_parts:
-
-"
+hwarn(
+"WildBenchAnnotator got a malformed annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )
 else:
@@ -107,8 +107,8 @@ class WildBenchAnnotator(Annotator):
 try:
 score = float(score_text)
 except ValueError:
-
-"
+hwarn(
+"WildBenchAnnotator could not parse the score from the annotation from "
 f"{annotator_model_info.model_name}: {annotator_response_text}"
 )


helm/benchmark/executor.py CHANGED
@@ -1,19 +1,19 @@
 from typing import Optional
 from dataclasses import dataclass, replace
+
+from helm.common.context import Context
+from helm.common.local_context import LocalContext
+from helm.common.remote_context import RemoteContext
 from helm.common.cache_backend_config import (
 CacheBackendConfig,
 BlackHoleCacheBackendConfig,
 MongoCacheBackendConfig,
 SqliteCacheBackendConfig,
 )
-
 from helm.common.general import parallel_map
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.common.request import RequestResult, GeneratedOutput
 from helm.common.authentication import Authentication
-from helm.proxy.services.remote_service import RemoteService
-from helm.proxy.services.server_service import ServerService
-from helm.proxy.services.service import Service
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState

@@ -29,7 +29,7 @@ class ExecutionSpec:
 """If non-empty, URL of the proxy server we send requests to (e.g., http://localhost:1959)."""

 auth: Authentication
-"""Authentication that will be passed into the
+"""Authentication that will be passed into the remote service, if using the remote context."""

 local_path: Optional[str]
 """Path where API credentials and cache is stored.
@@ -75,15 +75,14 @@ class Executor:
 else:
 cache_backend_config = BlackHoleCacheBackendConfig()

-self.
+self.context: Context
 if execution_spec.url:
 hlog(f"Running using remote API proxy server: {execution_spec.url}")
-self.
+self.context = RemoteContext(execution_spec.url, execution_spec.auth)
 elif execution_spec.local_path:
 hlog(f"Running in local mode with base path: {execution_spec.local_path}")
-self.
+self.context = LocalContext(
 base_path=execution_spec.local_path,
-root_mode=True,
 cache_backend_config=cache_backend_config,
 )
 else:
@@ -111,12 +110,12 @@

 def process(self, state: RequestState) -> RequestState:
 try:
-result: RequestResult = self.
+result: RequestResult = self.context.make_request(state.request)
 except Exception as e:
 raise ExecutorError(f"{str(e)} Request: {state.request}") from e
 if not result.success:
 if result.error_flags and not result.error_flags.is_fatal:
-
+hwarn(f"Non-fatal error treated as empty completion: {result.error}")
 result.completions = [GeneratedOutput(text="", logprob=0, tokens=[])]
 else:
 raise ExecutorError(f"{str(result.error)} Request: {state.request}")
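Condensed from the hunks above, a sketch of how the executor now selects a context; the helper function is hypothetical (the real logic is inline in Executor.__init__) and the cache backend is simplified to the black-hole config:

from helm.common.authentication import Authentication
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
from helm.common.context import Context
from helm.common.local_context import LocalContext
from helm.common.remote_context import RemoteContext


def make_context(url: str, local_path: str, auth: Authentication) -> Context:
    # Hypothetical helper mirroring Executor.__init__: remote proxy if a URL is given, otherwise local.
    if url:
        return RemoteContext(url, auth)
    return LocalContext(base_path=local_path, cache_backend_config=BlackHoleCacheBackendConfig())


# Downstream code stays context-agnostic; per the hunk above, Executor.process just calls:
#     result: RequestResult = self.context.make_request(state.request)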
helm/benchmark/metrics/aci_bench_metrics.py CHANGED
@@ -1,34 +1,14 @@
-from typing import Any, Dict, List
-
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.
-from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric_service import MetricService
-from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


-class ACIBenchMetric(
+class ACIBenchMetric(LLMJuryMetric):
 """Score metrics for ACIBench."""

-def
-
-
-
-
-
-
-assert request_state.annotations
-annotations: Dict[str, Any] = request_state.annotations["aci_bench"]
-scores: List[int] = []
-score = 0.0
-for annotation_key, annotation_dict in annotations.items():
-if annotation_key in ANNOTATOR_MODELS.keys() and annotation_dict is not None:
-for val in annotation_dict.values():
-scores.append(int(val["score"]))
-if scores:
-score = sum(scores) / len(scores)
-return [
-Stat(MetricName("aci_bench_accuracy")).add(score),
-]
+def __init__(self):
+super().__init__(
+metric_name="aci_bench_accuracy",
+scenario_name="aci_bench",
+annotator_models=ANNOTATOR_MODELS,
+default_score=1.0,
+)
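The MedHELM metric files listed above with identical "+9 -29" counts (med_dialog_metrics.py, medalign_metrics.py, mimic_rrs_metrics.py, and so on) appear to follow the same refactor onto the new LLMJuryMetric base class shown in this hunk. A hypothetical subclass written in that style; the class, metric, and scenario names below are illustrative and not taken from the package:

from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS  # reusing the jury shown above
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric


class ExampleJuryMetric(LLMJuryMetric):
    """Hypothetical jury-scored metric following the ACIBenchMetric pattern above."""

    def __init__(self):
        super().__init__(
            metric_name="example_accuracy",
            scenario_name="example_scenario",
            annotator_models=ANNOTATOR_MODELS,
            default_score=1.0,
        )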