crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/model_deployment_registry.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -182,7 +180,7 @@ def get_default_model_deployment_for_model(
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
     if deployment.deprecated and ignore_deprecated:
         if warn_arg_deprecated:
-            hlog(f"WARNING: Model deployment {model_name} is deprecated")
+            hwarn(f"Model deployment {model_name} is deprecated")
         return None
     return deployment.name
 
@@ -193,7 +191,7 @@ def get_default_model_deployment_for_model(
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
            hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
 
        # Additionally, if there is a non-deprecated deployment, use it.
@@ -210,7 +208,7 @@ def get_default_model_deployment_for_model(
         else:
             chosen_deployment = available_deployments[0]
             if warn_arg_deprecated:
-                hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+                hwarn(f"All model deployments for model {model_name} are deprecated.")
         if warn_arg_deprecated:
             hlog(
                 f"Choosing {chosen_deployment.name} (the first one) as "
helm/benchmark/presentation/contamination.py CHANGED
@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
helm/benchmark/presentation/create_plots.py CHANGED
@@ -11,7 +11,7 @@ import numpy as np
 from scipy.stats import pearsonr
 
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
@@ -600,17 +600,7 @@ class Plotter:
         self.create_constrast_set_plots()
 
 
-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +611,36 @@ def main():
     plotter.create_all_plots()
 
 
+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    args = parser.parse_args()
+    setup_default_logging()
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
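Note: with argument parsing split out of main(), plot generation can in principle be driven from Python by passing create_plots a namespace carrying the same attributes the parser would produce. A sketch under that assumption (the suite name is a placeholder, and summarize must have been run for the same suite first):

    from argparse import Namespace

    from helm.benchmark.presentation.create_plots import create_plots
    from helm.common.hierarchical_logger import setup_default_logging

    setup_default_logging()  # mirrors what the new main() does before delegating
    args = Namespace(output_path="benchmark_output", suite="my-suite", plot_format="png")
    create_plots(args)  # reads benchmark_output/runs/my-suite/groups, as in the function body above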
helm/benchmark/presentation/run_display.py CHANGED
@@ -59,6 +59,9 @@ class DisplayPrediction:
 
     annotations: Optional[Dict[str, Any]]
 
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )
 
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]
 
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
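Note: DisplayPrediction now carries an optional thinking_text field, populated from the completion's thinking block when a reasoning model returns one. A hedged sketch of how a consumer might read it from the per-run display JSON (the display_predictions.json file name and record shape are assumptions based on the writer above, not shown in this diff):

    import json

    # Assumed output location: <benchmark_output>/runs/<suite>/<run>/display_predictions.json
    with open("display_predictions.json") as f:
        predictions = json.load(f)

    for prediction in predictions:
        thinking = prediction.get("thinking_text")
        if thinking:
            print(prediction["instance_id"], thinking[:100])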
helm/benchmark/presentation/schema.py CHANGED
@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn
 
 
 # TODO: change to `helm.benchmark.config`
@@ -281,5 +282,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
helm/benchmark/presentation/summarize.py CHANGED
@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -102,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
         # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
         # as the main metric but multiple_choice_separate_original only generates exact_match
         if matcher.name == "quasi_exact_match":
-            hlog("WARNING: No quasi_exact_match metric found, looking for exact_match instead")
+            hwarn("No quasi_exact_match metric found, looking for exact_match instead")
             matcher = replace(matcher, name="exact_match")
             matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -406,8 +406,8 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-                    hlog(
-                        f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                         f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
@@ -440,14 +440,14 @@ class Summarizer:
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-                hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-                hlog(
-                    f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -544,8 +544,8 @@ class Summarizer:
 
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-                hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -738,8 +738,8 @@ class Summarizer:
         if stat is None:
             # Print out near misses to provide a more informative warning
             near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-            hlog(
-                f"WARNING: run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
+            hwarn(
+                f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                 f"{len(near_misses)} near misses matching just the name"
             )
             if len(near_misses) > 0:
@@ -810,7 +810,7 @@ class Summarizer:
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-            hlog(f"WARNING: table {title}, has no rows or columns, leaving empty")
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])
 
         header: List[HeaderCell] = []
@@ -831,7 +831,7 @@ class Summarizer:
             matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -959,8 +959,8 @@ class Summarizer:
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-                hlog(
-                    f"WARNING: table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -1232,10 +1232,57 @@ class Summarizer:
 
 
 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
@@ -1253,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites", type=str, nargs="+", help="Experimental: List of suites to summarize for this this release."
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1285,47 +1341,8 @@ def main():
         help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
     args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
-    summarizer = summarizer_cls(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
-    )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
-    hlog("Done.")
+    setup_default_logging()
+    summarize(args)
 
 
 if __name__ == "__main__":
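Note: the same parse-then-delegate split used in create_plots.py above (and in run.py below) appears here: main() only builds the parser, then calls setup_default_logging() and summarize(args), which performs the --suite vs. --release/--suites validation. A sketch of a programmatic call, assuming a namespace with the fields summarize() reads (all values below are placeholders):

    from argparse import Namespace

    from helm.benchmark.presentation.summarize import summarize

    args = Namespace(
        suite="my-suite",                # exactly one of suite or (release + suites) may be set
        release=None,
        suites=None,
        schema_path=None,                # falls back to get_default_schema_path()
        output_path="benchmark_output",
        local_path="prod_env",           # passed to register_configs_from_directory
        summarizer_class_name=None,      # None selects the default Summarizer
        debug=False,
        num_threads=8,
        allow_unknown_models=True,
        skip_completed_run_display_json=False,
    )
    summarize(args)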
helm/benchmark/reeval_run.py CHANGED
@@ -6,7 +6,7 @@ from typing import List
 from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack
+from helm.common.hierarchical_logger import hlog, htrack, hwarn
 from helm.common.authentication import Authentication
 from helm.proxy.services.remote_service import create_authentication, add_service_args
 
@@ -191,9 +191,8 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
         )
 
     hlog("Done.")
helm/benchmark/reeval_runner.py CHANGED
@@ -12,7 +12,7 @@ from datasets import load_dataset
 
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, hwarn
 from helm.common.cache import cache_stats
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -193,7 +193,7 @@ class REEvalRunner(Runner):
             difficulty_dataset = load_dataset("stair-lab/reeval-difficulty", split=split_name)
             prompt_to_difficulty: dict[str, float] = {row["request.prompt"]: row["z"] for row in difficulty_dataset}
         except ValueError:
-            hlog(f"WARNING: no available difficulty for {split_name}, skipping")
+            hwarn(f"no available difficulty for {split_name}, skipping")
             return
 
         unasked_request_states: List[RequestState] = []
@@ -320,7 +320,7 @@ class REEvalRunner(Runner):
         metric_counts: typing.Counter[MetricName] = Counter([stat.name for stat in stats])
         for metric_name, count in metric_counts.items():
             if count > 1:
-                hlog(f"WARNING: duplicate metric name {metric_name}")
+                hwarn(f"duplicate metric name {metric_name}")
 
         # Print out the number of stats
         hlog(f"Generated {len(stats)} stats.")
helm/benchmark/run.py CHANGED
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,76 +200,9 @@ def validate_args(args):
 
 
 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):
 
+    validate_args(args)
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
 
@@ -358,13 +291,85 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
        )
 
     hlog("Done.")
 
 
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging()
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()