crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/model_deployment_registry.py
CHANGED

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml

-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment


@@ -159,12 +157,11 @@ def get_default_model_deployment_for_model(
     Example: "meta/llama-7b" => "together/llama-7b"

     The process to find a model deployment name is as follows:
-    1. If there is
-    2. If there
-    3. If there are no deployments for the model, returns None.
+    1. If there is at least one deployment for the model, use the last one that is available.
+    2. If there are no deployments for the model, returns None.

     This function will also try to find a model deployment name that is not deprecated.
-    If there are no non-deprecated deployments, it will return the
+    If there are no non-deprecated deployments, it will return the last deployment (even if it's deprecated).
     If ignore_deprecated is True, this function will return None if the model deployment is deprecated.

     If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same
@@ -177,23 +174,14 @@
         ignore_deprecated: Whether to return None if the model deployment is deprecated.
     """

-    # If there is
-    if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT:
-        deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
-        if deployment.deprecated and ignore_deprecated:
-            if warn_arg_deprecated:
-                hlog(f"WARNING: Model deployment {model_name} is deprecated")
-            return None
-        return deployment.name
-
-    # If there is at least one deployment for the model, use the first one that is available.
+    # If there is at least one deployment for the model, use the last one that is available.
     available_deployments: List[ModelDeployment] = [
         deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name
     ]
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
             hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")

         # Additionally, if there is a non-deprecated deployment, use it.
@@ -201,19 +189,21 @@
             deployment for deployment in available_deployments if not deployment.deprecated
         ]
         if len(non_deprecated_deployments) > 0:
-            chosen_deployment = non_deprecated_deployments[0]
+            chosen_deployment = non_deprecated_deployments[-1]
         # There are no non-deprecated deployments, so there are two options:
         # 1. If we can return an empty string, return it. (no model deployment is available)
-        # 2. If we can't return an empty string, return the first deployment (even if it's deprecated).
+        # 2. If we can't return an empty string, return the last deployment (even if it's deprecated).
         elif ignore_deprecated:
             return None
-        else:
-            chosen_deployment = available_deployments[0]
+        elif len(available_deployments) > 0:
+            chosen_deployment = available_deployments[-1]
             if warn_arg_deprecated:
-                hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+                hwarn(f"All model deployments for model {model_name} are deprecated.")
+        else:
+            return None
         if warn_arg_deprecated:
             hlog(
-                f"Choosing {chosen_deployment.name} (the first one) as "
+                f"Choosing {chosen_deployment.name} (the last one) as "
                 f"the default model deployment for model {model_name}"
             )
             hlog("If you want to use a different model deployment, please specify it explicitly.")
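Note on the recurring pattern: throughout this release, ad-hoc hlog(f"WARNING: ...") calls are replaced by a dedicated hwarn helper imported from helm.common.hierarchical_logger (which changes by +104 -12 in this release but is not shown here), and get_default_model_deployment_for_model now picks the last matching deployment instead of the first. A minimal sketch of what such a warning helper could look like, assuming it simply prefixes the message and delegates to hlog; the real implementation may differ:

    from helm.common.hierarchical_logger import hlog

    def hwarn(message: str) -> None:
        # Assumed behavior: tag the message and route it through the hierarchical logger.
        hlog(f"WARNING: {message}")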
helm/benchmark/presentation/contamination.py
CHANGED

@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml

-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema

@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")


 def read_contamination():
helm/benchmark/presentation/create_plots.py
CHANGED

@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 import argparse
 from collections import defaultdict
 from dataclasses import dataclass
@@ -11,7 +14,7 @@ import numpy as np
 from scipy.stats import pearsonr

 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA

@@ -600,17 +603,7 @@ class Plotter:
         self.create_constrast_set_plots()


-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +614,42 @@ def main():
     plotter.create_all_plots()


+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
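create_plots.py now splits the console entry point into create_plots(args) (the work) and main() (argument parsing plus logging setup via the new --log-config flag and setup_default_logging). A minimal sketch of reusing the same wiring in another command-line tool, assuming setup_default_logging takes an optional path to a YAML logging config as shown in the diff above:

    import argparse

    from helm.common.hierarchical_logger import hlog, setup_default_logging


    def do_work(args: argparse.Namespace) -> None:
        # Logging is already configured by the time this runs.
        hlog(f"Processing suite {args.suite}")


    def main() -> None:
        parser = argparse.ArgumentParser()
        parser.add_argument("--suite", type=str, required=True)
        parser.add_argument("--log-config", type=str, default=None, help="PATH to a YAML file to customize logging")
        args = parser.parse_args()
        setup_default_logging(args.log_config)  # assumed to fall back to defaults when None
        do_work(args)


    if __name__ == "__main__":
        main()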
helm/benchmark/presentation/run_display.py
CHANGED

@@ -59,6 +59,9 @@ class DisplayPrediction:

     annotations: Optional[Dict[str, Any]]

+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+

 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )

+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@
             if os.path.exists(image_location)
         ]

+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
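The new thinking_text field threads the reasoning trace of "thinking" models through to the per-run display JSON. A small sketch of a downstream reader; the display_predictions.json file name and layout are assumptions based on run_display.py's existing conventions, not something shown in this diff:

    import json
    import os

    run_path = "benchmark_output/runs/my-suite/my-run"  # hypothetical run directory

    with open(os.path.join(run_path, "display_predictions.json")) as f:
        predictions = json.load(f)

    for prediction in predictions:
        thinking = prediction.get("thinking_text")  # None or absent for models without a thinking trace
        if thinking:
            print(prediction["instance_id"], f"({len(thinking)} characters of thinking)")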
helm/benchmark/presentation/schema.py
CHANGED

@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn


 # TODO: change to `helm.benchmark.config`
@@ -204,6 +205,11 @@ class RunGroup(Field):
     # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore.
     adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"])

+    # Optional short description of the run group.
+    # This description is used in some space-constrained places in frontend tables.
+    # If unset, the description field will be used instead.
+    short_description: Optional[str] = None
+

 @dataclass
 class Schema:
@@ -281,5 +287,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
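The new RunGroup.short_description is intended for space-constrained frontend tables and falls back to description when unset (see the summarize.py change below). A minimal illustration of that fallback using a stand-in dataclass rather than HELM's RunGroup:

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class Group:  # stand-in for RunGroup, which has many more fields
        description: str
        short_description: Optional[str] = None


    group = Group(description="A long-form description of the run group.")
    # Same fallback chain used when building table headers: short_description, then description.
    label = group.short_description or group.description or ""
    print(label)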
helm/benchmark/presentation/summarize.py
CHANGED

@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -102,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-        hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -294,7 +294,6 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:


 class AggregationStrategy:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     WIN_RATE = "win_rate"
     MEAN = "mean"

@@ -406,8 +405,8 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-                    hlog(
-                        f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                         f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
@@ -440,14 +439,14 @@
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -544,8 +543,8 @@

         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )

@@ -738,8 +737,8 @@
             if stat is None:
                 # Print out near misses to provide a more informative warning
                 near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-                hlog(
-                    f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+                hwarn(
+                    f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                     f"{len(near_misses)} near misses matching just the name"
                 )
                 if len(near_misses) > 0:
@@ -810,7 +809,7 @@
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])

         header: List[HeaderCell] = []
@@ -831,7 +830,7 @@
                 matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -839,7 +838,8 @@
             }

             header_name = header_field.get_short_display_name()
-            description = (run_group.description + "\n\n" if run_group.description else "") + (
+            run_group_short_description = run_group.short_description or run_group.description or ""
+            description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
                 (header_field.display_name if header_field.display_name else header_field.name)
                 + ": "
                 + (header_field.description if header_field.description is not None else "")
@@ -959,8 +959,8 @@
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -1232,10 +1232,57 @@


 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o",
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1253,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites",
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1284,48 +1340,15 @@ def main():
         default=None,
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-    args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
-    summarizer = summarizer_cls(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    summarize(args)


 if __name__ == "__main__":
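With argument handling isolated in main() and the pipeline wrapped in summarize(args), summarization can in principle be driven programmatically. A sketch only: the Namespace field names come from the diff above, but the default values chosen here are assumptions, not documented behavior:

    from argparse import Namespace

    from helm.benchmark.presentation.summarize import summarize

    args = Namespace(
        suite="v1",                      # or suite=None together with release=... and suites=[...]
        release=None,
        suites=None,
        schema_path=None,                # None falls back to get_default_schema_path()
        output_path="benchmark_output",
        local_path="prod_env",           # assumption: mirrors other HELM CLIs
        debug=False,
        num_threads=8,
        allow_unknown_models=True,       # assumption
        summarizer_class_name=None,
        skip_completed_run_display_json=False,
    )
    summarize(args)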
helm/benchmark/presentation/test_create_plots.py
CHANGED

@@ -1,4 +1,7 @@
-#
+# type: ignore
+# flake8: noqa
+# fmt: off
+
 from helm.common.general import asdict_without_nones
 from helm.benchmark.presentation.table import Table, Cell, HeaderCell
 from helm.benchmark.presentation.create_plots import parse_table
helm/benchmark/reeval_run.py
CHANGED

@@ -6,7 +6,7 @@ from typing import List
 from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack, hwarn
 from helm.common.authentication import Authentication
 from helm.proxy.services.remote_service import create_authentication, add_service_args

@@ -191,9 +191,8 @@ def main():
     )

     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )

     hlog("Done.")
helm/benchmark/reeval_runner.py
CHANGED

@@ -12,7 +12,7 @@ from datasets import load_dataset

 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
 from helm.common.cache import cache_stats
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -193,7 +193,7 @@ class REEvalRunner(Runner):
             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
         except ValueError:
-            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            hwarn(f"no available difficulty for {split_name}, skipping")
             return

         unasked_request_states: List[RequestState] = []
@@ -320,7 +320,7 @@ class REEvalRunner(Runner):
         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
         for metric_name, count in metric_counts.items():
             if count > 1:
-                hlog(f"WARNING: duplicate metric name {metric_name}")
+                hwarn(f"duplicate metric name {metric_name}")

         # Print out the number of stats
         hlog(f"Generated {len(stats)} stats.")