crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run.py CHANGED
@@ -9,7 +9,7 @@ from helm.benchmark import model_metadata_registry
 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, setup_default_logging, hwarn
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
@@ -200,76 +200,9 @@ def validate_args(args):
 
 
 @htrack(None)
-def main():
-    parser = argparse.ArgumentParser()
-    add_service_args(parser)
-    parser.add_argument(
-        "-c",
-        "--conf-paths",
-        nargs="+",
-        help="Where to read RunSpecs to run from",
-        default=[],
-    )
-    parser.add_argument(
-        "--models-to-run",
-        nargs="+",
-        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
-        default=None,
-    )
-    parser.add_argument(
-        "--groups-to-run",
-        nargs="+",
-        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
-        default=None,
-    )
-    parser.add_argument(
-        "--exit-on-error",
-        action="store_true",
-        help="Fail and exit immediately if a particular RunSpec fails.",
-    )
-    parser.add_argument(
-        "--skip-completed-runs",
-        action="store_true",
-        help="Skip RunSpecs that have completed i.e. output files exists.",
-    )
-    parser.add_argument(
-        "--priority",
-        type=int,
-        default=None,
-        help="Run RunSpecs with priority less than or equal to this number. "
-        "If a value for --priority is not specified, run on everything",
-    )
-    parser.add_argument(
-        "--run-specs",
-        nargs="*",
-        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
-        "Specifies run entries to run.",
-        default=[],
-    )
-    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
-    parser.add_argument(
-        "--enable-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
-        "Format: namespace/model_name[@revision]",
-    )
-    parser.add_argument(
-        "--enable-local-huggingface-models",
-        nargs="+",
-        default=[],
-        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
-    )
-    parser.add_argument(
-        "--runner-class-name",
-        type=str,
-        default=None,
-        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
-    )
-    add_run_args(parser)
-    args = parser.parse_args()
-    validate_args(args)
+def helm_run(args):
 
+    validate_args(args)
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)
 
@@ -358,13 +291,91 @@ def main():
     )
 
     if args.run_specs:
-        hlog(
-            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
-            "Use --run-entries instead."
+        hwarn(
+            "The --run-specs flag is deprecated and will be removed in a future release. " "Use --run-entries instead."
        )
 
     hlog("Done.")
 
 
+# Separate parsing from starting HELM so we can setup logging
+def main():
+    parser = argparse.ArgumentParser()
+    add_service_args(parser)
+    parser.add_argument(
+        "-c",
+        "--conf-paths",
+        nargs="+",
+        help="Where to read RunSpecs to run from",
+        default=[],
+    )
+    parser.add_argument(
+        "--models-to-run",
+        nargs="+",
+        help="Only RunSpecs with these models specified. If no model is specified, runs with all models.",
+        default=None,
+    )
+    parser.add_argument(
+        "--groups-to-run",
+        nargs="+",
+        help="Only RunSpecs with these (scenario) groups specified. " "If no group is specified, runs with all groups.",
+        default=None,
+    )
+    parser.add_argument(
+        "--exit-on-error",
+        action="store_true",
+        help="Fail and exit immediately if a particular RunSpec fails.",
+    )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
+    parser.add_argument(
+        "--priority",
+        type=int,
+        default=None,
+        help="Run RunSpecs with priority less than or equal to this number. "
+        "If a value for --priority is not specified, run on everything",
+    )
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
+    parser.add_argument(
+        "--enable-local-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from a local path.",
+    )
+    parser.add_argument(
+        "--runner-class-name",
+        type=str,
+        default=None,
+        help="Full class name of the Runner class to use. If unset, uses the default Runner.",
+    )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    add_run_args(parser)
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    return helm_run(args)
+
+
 if __name__ == "__main__":
     main()
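
Note: the run.py change above splits the old main() into two functions. main() now only builds the argument parser (including the new --log-config flag) and configures logging before delegating to helm_run(args), which validates the arguments, registers configs, and executes the runs. A minimal sketch of the new logging entry points, using only the names introduced in this diff (passing None for the config path is an assumption based on the flag's default):

    # Sketch only: exercises the logging helpers that run.py now imports.
    from helm.common.hierarchical_logger import setup_default_logging, hlog, hwarn

    setup_default_logging(None)  # None -> HELM defaults; otherwise a path to a YAML logging config
    hlog("Logging is configured.")
    hwarn("Deprecation notices now go through hwarn() instead of a hand-written 'WARNING:' prefix.")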
helm/benchmark/run_expander.py CHANGED
@@ -21,7 +21,10 @@ from helm.benchmark.model_metadata_registry import (
     AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+)
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
@@ -537,6 +540,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
         "vhelm": [0, 1, 2, 4, 8],
+        "melt": [0, 1, 5],
     }
 
 
@@ -1476,6 +1480,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_no_period":
+                instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
             else:
@@ -1521,6 +1527,11 @@
                 )
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
+        elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            if self.scenario == "mmlu_pro" or self.scenario == "gpqa":
+                instructions = 'In your response, replace "insert answer here" with the single uppercase letter corresponding to your answer.'  # noqa: E501
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")
 
         if self.no_prefix:
             if instructions:
helm/benchmark/run_spec_factory.py CHANGED
@@ -143,12 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         ):
             run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
 
-        if model.name == "openai/o1-2024-12-17":
-            # From https://platform.openai.com/docs/guides/reasoning,
-            # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
-            # experimenting with these models. As you become familiar with the number of reasoning tokens your
-            # prompts require, you can adjust this buffer accordingly."
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+        # TODO: find a better solution for this
+        # if model.name.startswith("openai/o"):
+        #     # From https://platform.openai.com/docs/guides/reasoning,
+        #     # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        #     # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        #     # prompts require, you can adjust this buffer accordingly."
+        #     run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
 
         # IDEFICS special handling
         if IDEFICS_MODEL_TAG in model.tags:
helm/benchmark/run_specs/arabic_run_specs.py ADDED
@@ -0,0 +1,73 @@
+"""Run specs for Arabic leaderboard
+
+EXPERIMENTAL: Run specs here may have future reverse incompatible changes."""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("arabic_mmlu")
+def get_arabic_mmlu_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.arabic_mmlu_scenario.ArabicMMLUScenario")
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="arabic_mmlu",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["arabic_mmlu"],
+    )
+
+
+@run_spec_function("alghafa")
+def get_alghafa_spec(subset: str) -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.alghafa_scenario.AlGhafaScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"alghafa:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["alghafa", f"alghafa_{subset}"],
+    )
+
+
+@run_spec_function("aratrust")
+def get_aratrust_spec() -> RunSpec:
+    """EXPERIMENTAL: This run spec here may have future reverse incompatible changes."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.aratrust_scenario.AraTrustScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="The following are multiple choice questions. Answer the last question. Respond only with only a single letter corresponding to your choice.",  # noqa: E501
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="aratrust",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["aratrust"],
+    )
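
The new module registers three run spec functions (arabic_mmlu, alghafa, and aratrust). A rough usage sketch, assuming the @run_spec_function decorator returns the wrapped function unchanged as in other HELM run spec modules; the subset value is a placeholder, not a verified AlGhafa subset name:

    # Sketch only: constructs the new Arabic RunSpecs directly.
    from helm.benchmark.run_specs.arabic_run_specs import (
        get_arabic_mmlu_spec,
        get_alghafa_spec,
        get_aratrust_spec,
    )

    specs = [get_arabic_mmlu_spec(), get_alghafa_spec(subset="..."), get_aratrust_spec()]
    print([spec.name for spec in specs])  # ['arabic_mmlu', 'alghafa:subset=...', 'aratrust']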
helm/benchmark/run_specs/audio_run_specs.py CHANGED
@@ -113,6 +113,18 @@ def _get_gpt4_critique_metric_specs(num_respondents: int, max_tokens: int) -> Li
     ]
 
 
+def _get_gpt4_refusal_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
 ########################################################################################################################
 # RunSpecs
 
@@ -215,16 +227,20 @@ def get_mustard_audio_run_spec() -> RunSpec:
 
 
 @run_spec_function("voice_jailbreak_attacks")
-def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+def get_voice_jailbreak_attacks_run_spec(subset: str, num_respondents: int = 1) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
         "VoiceJailbreakAttacksScenario",
         args={"subset": subset},
     )
-    adapter_spec = _get_generation_adapter_spec(max_tokens=1024)
-    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
-        include_basic_metrics=True, include_generative_harms_metrics=True
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Listen to the audio and respond according to its instructions.",
+        max_tokens=1024,
     )
+    metric_specs: List[MetricSpec] = _get_gpt4_refusal_metric_specs(
+        num_respondents=num_respondents,
+        max_tokens=200,
+    ) + get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True)
 
     run_spec_name: str = "voice_jailbreak_attacks"
     return RunSpec(
@@ -258,19 +274,20 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:
 
 
 @run_spec_function("vocal_sound")
-def get_vocal_sound_run_spec() -> RunSpec:
+def get_vocal_sound_run_spec(sound: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
+        args={"sound": sound},
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
         '"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
         max_tokens=5,
     )
-    metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
+    metric_specs = get_exact_match_metric_specs()
     run_spec_name: str = "vocal_sound"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:sound={sound}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
@@ -501,13 +518,20 @@ def get_air_bench_chat_run_spec(subject: str, num_respondents: int = 1) -> RunSp
         )
         + _get_open_ended_generation_metric_specs()
     )
+
     run_spec_name: str = "air_bench_chat"
+    group_name: str = run_spec_name
+    if subject in ["mix", "speech"]:
+        group_name += "_reasoning"
+    elif subject in ["sound", "music"]:
+        group_name += "_knowledge"
+
     return RunSpec(
         name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=[group_name],
     )
 
 
@@ -611,3 +635,23 @@ def get_parade_run_spec(voice: str, subset: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
+
+
+@run_spec_function("corebench")
+def get_corebench_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.corebench_scenario.COREBenchScenario",
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="",
+        max_tokens=10,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "corebench"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
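
Note: vocal_sound now requires a sound argument (and its run spec name becomes vocal_sound:sound=...), voice_jailbreak_attacks gained an optional num_respondents argument, and a new corebench run spec was added, so run entries for these scenarios need the new arguments. A hedged sketch of updated run-entry strings; the subset, sound, and model values are placeholders rather than values verified against the scenarios:

    # Illustrative run-entry strings only; fill in real values before use.
    updated_audio_run_entries = [
        "vocal_sound:sound=...,model=...",
        "voice_jailbreak_attacks:subset=...,num_respondents=1,model=...",
        "corebench:model=...",
    ]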
helm/benchmark/run_specs/bluex_run_specs.py ADDED
@@ -0,0 +1,40 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bluex")
+def get_bluex_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bluex_scenario.BLUEX_Scenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="""
+        Escolha a alternativa correta para as questões de vestibulares (responda apenas com a letra).
+        Exemplo de Pergunta com a resposta:
+        Em um romance narrado em primeira pessoa, o narrador participa dos acontecimentos da trama,
+        relatando suas próprias experiências e sentimentos. Qual alternativa apresenta essa característica?
+
+        (A) Narrador onisciente que conhece os pensamentos de todas as personagens.
+        (B) Narrador que descreve os fatos de forma imparcial, sem envolvimento emocional.
+        (C) Narrador-personagem que vivencia e relata os eventos da história.
+        (D) Narrador observador que apenas registra as ações visíveis.
+        (E) Narrador em segunda pessoa que se dirige constantemente ao leitor.
+
+        Resposta correta: C
+
+        A partir disso, responda:
+        """,
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="bluex",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["bluex"],
+    )
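
For reference, the new bluex run spec takes no arguments. A minimal sketch of constructing it directly (again assuming the decorator returns the wrapped function unchanged):

    from helm.benchmark.run_specs.bluex_run_specs import get_bluex_spec

    spec = get_bluex_spec()
    print(spec.name, spec.groups)  # bluex ['bluex']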
helm/benchmark/run_specs/classic_run_specs.py CHANGED
@@ -35,7 +35,6 @@ from helm.benchmark.metrics.common_metric_specs import (
     get_f1_metric_specs,
     get_generative_harms_metric_specs,
     get_language_modeling_metric_specs,
-    get_numeracy_metric_specs,
     get_open_ended_generation_metric_specs,
     get_summarization_metric_specs,
     get_basic_generation_metric_specs,
@@ -381,58 +380,6 @@ def get_raft_spec(subset: str) -> RunSpec:
     )
 
 
-@run_spec_function("numeracy")
-def get_numeracy_spec(
-    relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
-) -> RunSpec:
-    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
-
-    run_solver_bool: bool = True if run_solver.lower() == "true" else False
-    del run_solver
-    random_seed = int(seed)
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
-        args={"seed": random_seed, "relation_type": relation_type, "mode": mode},
-    )
-
-    if mode in ["example", "standard"]:
-        # Test a model's ability to impute datapoints for a given (example or randomly sampled) relation.
-        adapter_args: Dict[str, Any] = {
-            "max_train_instances": 100,
-            "max_eval_instances": 100,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-        }
-    elif mode == "function":
-        # Test a model's ability to impute datapoints for randomly sampled relations
-        # (resampled for each evaluation point).
-        adapter_args = {
-            "instructions": "",
-            "max_train_instances": 0,  # Turn off general version of `function` mode because it doesn't cleanly
-            # capture a higher-order version of this task / is a little convoluted
-            # for models, currently.
-            # (In the general version, the model sees other relations of the same class,
-            # and needs to impute a datapoint for the last one. Presumably, inferring
-            # the class - eg. the degree of the relation - would help.)
-            "max_eval_instances": 1000,
-            "dim": RELTYPE_INFO[relation_type].num_variables + 1,
-            "instance_prefix": "\n\n",
-        }
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
-
-    adapter_spec = get_numeracy_adapter_spec(**adapter_args)  # Construct the AdapterSpec using a helper function.
-    # `get_numeracy_adapter_spec` is defined in numeracy_scenario.py
-    # because it is used within the scenario to construct the instances themselves.
-
-    return RunSpec(
-        name=f"numeracy:relation_type={relation_type},mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(run_solver_bool),
-        groups=["numeracy"],
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(