crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of crfm-helm has been flagged as potentially problematic.

Files changed (206)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  13. helm/benchmark/annotation/model_as_judge.py +12 -16
  14. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  15. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  16. helm/benchmark/executor.py +11 -12
  17. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  18. helm/benchmark/metrics/bias_word_lists.py +1 -1
  19. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  20. helm/benchmark/metrics/classification_metrics.py +3 -3
  21. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  22. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  23. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  24. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  25. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  26. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  27. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  28. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  29. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  30. helm/benchmark/metrics/medalign_metrics.py +9 -29
  31. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  32. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  33. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  34. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  35. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  36. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  37. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  38. helm/benchmark/metrics/metric_service.py +11 -11
  39. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  40. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  41. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  42. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  43. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  44. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  45. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  46. helm/benchmark/metrics/summac/model_summac.py +1 -2
  47. helm/benchmark/metrics/summarization_metrics.py +2 -1
  48. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  49. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  50. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  51. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  52. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  53. helm/benchmark/model_deployment_registry.py +6 -8
  54. helm/benchmark/presentation/contamination.py +3 -3
  55. helm/benchmark/presentation/create_plots.py +33 -12
  56. helm/benchmark/presentation/run_display.py +13 -0
  57. helm/benchmark/presentation/schema.py +2 -1
  58. helm/benchmark/presentation/summarize.py +76 -59
  59. helm/benchmark/reeval_run.py +3 -4
  60. helm/benchmark/reeval_runner.py +3 -3
  61. helm/benchmark/run.py +78 -73
  62. helm/benchmark/run_expander.py +12 -1
  63. helm/benchmark/run_spec_factory.py +7 -6
  64. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  65. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  66. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  67. helm/benchmark/run_specs/long_context_run_specs.py +67 -15
  68. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  69. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  70. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  71. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  72. helm/benchmark/runner.py +5 -5
  73. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  74. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  75. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  76. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  77. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  78. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  79. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  80. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  81. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  82. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  83. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  84. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  85. helm/benchmark/scenarios/clear_scenario.py +11 -7
  86. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  87. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  88. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  89. helm/benchmark/scenarios/grammar.py +2 -2
  90. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  91. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  92. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  93. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  94. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  95. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  96. helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
  97. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  98. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  99. helm/benchmark/scenarios/medec_scenario.py +6 -1
  100. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  101. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  102. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  103. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  104. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  105. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  106. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  107. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  108. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  109. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  110. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  111. helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
  112. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  113. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  114. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  115. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  116. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  117. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  118. helm/benchmark/scenarios/numeracy_scenario.py +2 -1
  119. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  120. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  121. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  122. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  123. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  124. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  125. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  126. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  127. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  128. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  129. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  130. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  131. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  132. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  133. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  134. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  135. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  136. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  137. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  138. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  139. helm/benchmark/server.py +2 -1
  140. helm/benchmark/static/schema_audio.yaml +60 -49
  141. helm/benchmark/static/schema_enterprise.yaml +21 -0
  142. helm/benchmark/static/schema_long_context.yaml +63 -20
  143. helm/benchmark/static/schema_medhelm.yaml +272 -213
  144. helm/benchmark/static/schema_melt.yaml +1257 -0
  145. helm/benchmark/static/schema_slphelm.yaml +162 -0
  146. helm/benchmark/static/schema_vhelm.yaml +26 -26
  147. helm/benchmark/static/schema_video.yaml +219 -0
  148. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  149. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  150. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  151. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  152. helm/benchmark/static_build/index.html +4 -4
  153. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  154. helm/benchmark/window_services/test_utils.py +3 -4
  155. helm/benchmark/window_services/tokenizer_service.py +7 -8
  156. helm/clients/anthropic_client.py +69 -29
  157. helm/clients/audio_language/diva_llama_client.py +4 -2
  158. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  159. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  160. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  161. helm/clients/audio_language/test.py +62 -0
  162. helm/clients/bedrock_client.py +3 -1
  163. helm/clients/client.py +7 -7
  164. helm/clients/grok_client.py +36 -0
  165. helm/clients/huggingface_client.py +42 -3
  166. helm/clients/huggingface_pipeline_client.py +138 -0
  167. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  168. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  169. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  170. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  171. helm/clients/openai_client.py +100 -54
  172. helm/clients/openai_responses_client.py +174 -0
  173. helm/clients/palmyra_client.py +2 -5
  174. helm/clients/reka_client.py +2 -2
  175. helm/clients/together_client.py +31 -4
  176. helm/clients/vertexai_client.py +6 -0
  177. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  178. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  179. helm/clients/vision_language/idefics_client.py +6 -2
  180. helm/clients/vision_language/paligemma_client.py +2 -2
  181. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  182. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  183. helm/clients/writer_client.py +102 -0
  184. helm/common/context.py +80 -0
  185. helm/common/credentials_utils.py +5 -5
  186. helm/common/general.py +9 -2
  187. helm/common/hierarchical_logger.py +46 -3
  188. helm/common/local_context.py +140 -0
  189. helm/common/remote_context.py +61 -0
  190. helm/common/request.py +8 -0
  191. helm/config/model_deployments.yaml +864 -193
  192. helm/config/model_metadata.yaml +667 -53
  193. helm/config/tokenizer_configs.yaml +144 -3
  194. helm/proxy/cli.py +3 -1
  195. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  196. helm/proxy/services/server_service.py +21 -85
  197. helm/tokenizers/grok_tokenizer.py +53 -0
  198. helm/tokenizers/huggingface_tokenizer.py +1 -1
  199. helm/tokenizers/test_grok_tokenizer.py +33 -0
  200. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  201. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  202. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  203. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  204. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
  205. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
  206. helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/run_expander.py
@@ -21,7 +21,10 @@ from helm.benchmark.model_metadata_registry import (
     AUDIO_LANGUAGE_MODEL_TAG,
     INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
-from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.benchmark.adaptation.adapters.adapter_factory import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+)
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
 from helm.benchmark.run_spec import RunSpec
 from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
@@ -537,6 +540,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
         "vhelm": [0, 1, 2, 4, 8],
+        "melt": [0, 1, 5],
     }
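MaxTrainInstancesRunExpander is a ReplaceValueRunExpander: the key named in a run entry selects a list of values, and one run spec fans out into one run per value. A minimal self-contained sketch of that fan-out behavior (the toy classes below are illustrative, not HELM's real ones):

from dataclasses import dataclass, replace
from typing import Dict, List

@dataclass(frozen=True)
class ToyAdapterSpec:
    max_train_instances: int = 0

@dataclass(frozen=True)
class ToyRunSpec:
    name: str
    adapter_spec: ToyAdapterSpec

# Subset of the values_dict above; "melt" is the entry added in this release.
FEW_SHOT_SETTINGS: Dict[str, List[int]] = {
    "vhelm": [0, 1, 2, 4, 8],
    "melt": [0, 1, 5],
}

def expand(run_spec: ToyRunSpec, setting: str) -> List[ToyRunSpec]:
    # One output run spec per few-shot value, mirroring ReplaceValueRunExpander.
    return [
        ToyRunSpec(
            name=f"{run_spec.name},max_train_instances={n}",
            adapter_spec=replace(run_spec.adapter_spec, max_train_instances=n),
        )
        for n in FEW_SHOT_SETTINGS[setting]
    ]

print([r.name for r in expand(ToyRunSpec("melt_example", ToyAdapterSpec()), "melt")])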
@@ -1476,6 +1480,8 @@ class OutputFormatInstructions(RunExpander):
                 instructions = "Answer with only a single letter."
             elif self.scenario == "mcqa":
                 instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa_no_period":
+                instructions = "Answer with only a single letter. Do not include a period in your answer."
             elif self.scenario == "mcqa_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
             else:
@@ -1521,6 +1527,11 @@ class OutputFormatInstructions(RunExpander):
                 )
             else:
                 raise ValueError(f"Unknown scenario {self.scenario}")
+        elif run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT:
+            if self.scenario == "mmlu_pro" or self.scenario == "gpqa":
+                instructions = 'In your response, replace "insert answer here" with the single uppercase letter corresponding to your answer.'  # noqa: E501
+            else:
+                raise ValueError(f"Unknown scenario {self.scenario}")

         if self.no_prefix:
             if instructions:
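Both new branches are reached through the expander's scenario argument in a run entry. Assuming OutputFormatInstructions keeps its usual expander key, output_format_instructions, and using placeholder model names, entries would look roughly like this:

# Hypothetical run entries; model names are placeholders.
run_entries = [
    # Forbid a trailing period after the answer letter:
    "med_qa:model=openai/gpt-4o,output_format_instructions=mcqa_no_period",
    # Chain-of-thought multiple choice; only mmlu_pro and gpqa are accepted:
    "mmlu_pro:subject=law,model=openai/gpt-4o,output_format_instructions=mmlu_pro",
]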
helm/benchmark/run_spec_factory.py
@@ -143,12 +143,13 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         ):
             run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))

-        if model.name == "openai/o1-2024-12-17":
-            # From https://platform.openai.com/docs/guides/reasoning,
-            # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
-            # experimenting with these models. As you become familiar with the number of reasoning tokens your
-            # prompts require, you can adjust this buffer accordingly."
-            run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))
+        # TODO: find a better solution for this
+        # if model.name.startswith("openai/o"):
+        #     # From https://platform.openai.com/docs/guides/reasoning,
+        #     # "OpenAI recommends reserving at least 25,000 tokens for reasoning and outputs when you start
+        #     # experimenting with these models. As you become familiar with the number of reasoning tokens your
+        #     # prompts require, you can adjust this buffer accordingly."
+        #     run_spec = singleton(IncreaseMaxTokensRunExpander(value=25_000).expand(run_spec))

         # IDEFICS special handling
         if IDEFICS_MODEL_TAG in model.tags:
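For reference, IncreaseMaxTokensRunExpander adds its value on top of the run spec's existing max_tokens budget, which is what the now-disabled block did to reserve room for o-series reasoning tokens. A sketch of that adjustment on a toy spec (not HELM's implementation):

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class ToyAdapterSpec:
    max_tokens: int

def increase_max_tokens(spec: ToyAdapterSpec, value: int) -> ToyAdapterSpec:
    # Reserve extra output budget by adding `value` to the current limit.
    return replace(spec, max_tokens=spec.max_tokens + value)

print(increase_max_tokens(ToyAdapterSpec(max_tokens=512), 25_000))  # max_tokens=25512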
helm/benchmark/run_specs/audio_run_specs.py
@@ -113,6 +113,18 @@ def _get_gpt4_critique_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
     ]


+def _get_gpt4_refusal_metric_specs(num_respondents: int, max_tokens: int) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
+            args={
+                "num_respondents": num_respondents,
+                "max_tokens": max_tokens,
+            },
+        )
+    ]
+
+
 ########################################################################################################################
 # RunSpecs
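A MetricSpec is a dotted class name plus constructor kwargs; at run time the class is imported and instantiated. A generic sketch of that resolution step (HELM's actual object factory lives elsewhere and does more validation):

import importlib
from typing import Any, Dict

def create_object(class_name: str, args: Dict[str, Any]) -> Any:
    # "pkg.module.ClassName" -> import pkg.module, then ClassName(**args).
    module_name, short_name = class_name.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), short_name)
    return cls(**args)

# e.g. create_object(
#     "helm.benchmark.metrics.gpt4_audio_refusal_metrics.GPT4AudioRefusalCritiqueMetric",
#     {"num_respondents": 1, "max_tokens": 200},
# )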
@@ -215,16 +227,20 @@ def get_mustard_audio_run_spec() -> RunSpec:


 @run_spec_function("voice_jailbreak_attacks")
-def get_voice_jailbreak_attacks_run_spec(subset: str) -> RunSpec:
+def get_voice_jailbreak_attacks_run_spec(subset: str, num_respondents: int = 1) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.voice_jailbreak_attacks_scenario."
         "VoiceJailbreakAttacksScenario",
         args={"subset": subset},
     )
-    adapter_spec = _get_generation_adapter_spec(max_tokens=1024)
-    metric_specs: List[MetricSpec] = get_generative_harms_metric_specs(
-        include_basic_metrics=True, include_generative_harms_metrics=True
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="Listen to the audio and respond according to its instructions.",
+        max_tokens=1024,
     )
+    metric_specs: List[MetricSpec] = _get_gpt4_refusal_metric_specs(
+        num_respondents=num_respondents,
+        max_tokens=200,
+    ) + get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True)

     run_spec_name: str = "voice_jailbreak_attacks"
     return RunSpec(
@@ -258,19 +274,20 @@ def get_covost2_run_spec(source_language: str, target_language: str) -> RunSpec:


 @run_spec_function("vocal_sound")
-def get_vocal_sound_run_spec() -> RunSpec:
+def get_vocal_sound_run_spec(sound: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.audio_language.vocal_sound_scenario.VocalSoundScenario",
+        args={"sound": sound},
     )
     adapter_spec = _get_generation_adapter_spec(
         instructions="Listen to the audio and classify the speaker behavior. Choose only from these options:"
         '"Cough", "Laughter", "Sigh", "Sneeze", "Sniff", or "Throat clearing". Respond with just the behavior.',
         max_tokens=5,
     )
-    metric_specs = get_exact_match_metric_specs() + get_classification_metric_specs()
+    metric_specs = get_exact_match_metric_specs()
     run_spec_name: str = "vocal_sound"
     return RunSpec(
-        name=run_spec_name,
+        name=f"{run_spec_name}:sound={sound}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
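With the new sound argument, one run per behavior class replaces the single pooled run, and run names gain a :sound= suffix. Hypothetical entries (the exact sound tokens depend on VocalSoundScenario; the lowercase values and model name here are assumptions):

sounds = ["cough", "laughter", "sigh", "sneeze", "sniff", "throat_clearing"]
run_entries = [f"vocal_sound:sound={s},model=openai/gpt-4o-audio-preview" for s in sounds]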
@@ -501,13 +518,20 @@ def get_air_bench_chat_run_spec(subject: str, num_respondents: int = 1) -> RunSpec:
         )
         + _get_open_ended_generation_metric_specs()
     )
+
     run_spec_name: str = "air_bench_chat"
+    group_name: str = run_spec_name
+    if subject in ["mix", "speech"]:
+        group_name += "_reasoning"
+    elif subject in ["sound", "music"]:
+        group_name += "_knowledge"
+
     return RunSpec(
         name=f"{run_spec_name}:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=[group_name],
     )
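The grouping logic above reduces to a small pure function, which makes the new leaderboard assignment explicit:

def air_bench_chat_group(subject: str) -> str:
    # Mirrors the run spec: mix/speech report as reasoning, sound/music as knowledge.
    if subject in ("mix", "speech"):
        return "air_bench_chat_reasoning"
    if subject in ("sound", "music"):
        return "air_bench_chat_knowledge"
    return "air_bench_chat"

assert air_bench_chat_group("speech") == "air_bench_chat_reasoning"
assert air_bench_chat_group("music") == "air_bench_chat_knowledge"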
@@ -611,3 +635,23 @@ def get_parade_run_spec(voice: str, subset: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
+
+
+@run_spec_function("corebench")
+def get_corebench_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.audio_language.corebench_scenario.COREBenchScenario",
+    )
+    adapter_spec = _get_generation_adapter_spec(
+        instructions="",
+        max_tokens=10,
+    )
+    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
+    run_spec_name: str = "corebench"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],
+    )
helm/benchmark/run_specs/enterprise_run_specs.py
@@ -100,6 +100,26 @@ def get_conv_fin_qa_calc_spec() -> RunSpec:
     )


+@run_spec_function("kpi_edgar")
+def get_kpi_edgar_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.kpi_edgar_scenario.KPIEDGARScenario",
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None, output_noun="Answer", max_tokens=100, max_train_instances=20
+    )
+
+    return RunSpec(
+        name="kpi_edgar",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs([])
+        + [MetricSpec(class_name="helm.benchmark.metrics.kpi_edgar_metrics.KPIEdgarMetric")],
+        groups=["kpi_edgar"],
+    )
+
+
 # Legal
helm/benchmark/run_specs/experimental_run_specs.py
@@ -6,7 +6,11 @@ from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
 from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
 from helm.benchmark.annotation.annotator import AnnotatorSpec
-from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -192,3 +196,29 @@ def get_czech_bank_qa_spec(config_name: str = "berka_queries_1024_2024_12_18") -> RunSpec:
         annotators=[AnnotatorSpec("helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator")],
         groups=["czech_bank_qa"],
     )
+
+
+@run_spec_function("medi_qa_without_annotator")
+def get_medi_qa_without_annotator_spec() -> RunSpec:
+    """A version of medi_qa that does not use annotators.
+
+    EXPERIMENTAL: You should probably use medi_qa instead."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medi_qa_scenario.MediQAScenario", args={})
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Answer the following consumer health question.",
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=1024,
+        max_train_instances=0,
+        stop_sequences=[],
+    )
+
+    metric_specs = get_open_ended_generation_metric_specs()
+    return RunSpec(
+        name="medi_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["medi_qa"],
+    )
helm/benchmark/run_specs/long_context_run_specs.py
@@ -1,5 +1,9 @@
-from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
-from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_open_ended_generation_metric_specs
+from helm.benchmark.adaptation.adapter_spec import ADAPT_CHAT, ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_exact_match_metric_specs,
+    get_open_ended_generation_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -26,7 +30,7 @@ def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:


 @run_spec_function("ruler_hotpotqa")
-def get_ruler_hotpotqa_spec(max_num_words: int = 65536) -> RunSpec:
+def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERHotpotQAScenario",
         args={
@@ -35,18 +39,21 @@ def get_ruler_hotpotqa_spec(max_num_words: int = 131072) -> RunSpec:
     )

     adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=100)
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.ruler_qa_metrics.RulerQAMetric")
+    ]

     return RunSpec(
         name=f"ruler_hotpotqa:max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_open_ended_generation_metric_specs(),
+        metric_specs=metric_specs,
         groups=["ruler_hotpotqa"],
     )


 @run_spec_function("ruler_squad")
-def get_ruler_squad_spec(max_num_words: int = 65536) -> RunSpec:
+def get_ruler_squad_spec(max_num_words: int = 131072) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERSQuADScenario",
         args={
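The new RulerQAMetric rides alongside the open-ended generation metrics in both RULER specs. RULER's QA scoring is conventionally substring recall over the gold answers; a sketch under that assumption (the shipped ruler_qa_metrics.py may differ in detail):

from typing import List

def ruler_qa_match(prediction: str, golds: List[str]) -> float:
    # Full credit if any gold answer string appears in the model output.
    pred = prediction.strip().lower()
    return float(any(g.strip().lower() in pred for g in golds))

print(ruler_qa_match("The answer is Paris.", ["Paris", "Paris, France"]))  # 1.0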
@@ -55,35 +62,80 @@ def get_ruler_squad_spec(max_num_words: int = 131072) -> RunSpec:
     )

     adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=100)
+    metric_specs = get_open_ended_generation_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.ruler_qa_metrics.RulerQAMetric")
+    ]

     return RunSpec(
         name=f"ruler_squad:max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_open_ended_generation_metric_specs(),
+        metric_specs=metric_specs,
         groups=["ruler_squad"],
     )


-@run_spec_function("infinite_bench_sum")
-def get_infinite_bench_sum_spec(min_num_words: int = 0, max_num_words: int = 65536) -> RunSpec:
+@run_spec_function("infinite_bench_en_qa")
+def get_infinite_bench_en_qa_spec(max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_en_qa_scenario.InfiniteBenchEnQAScenario",
+        args={
+            "max_num_words": max_num_words,
+        },
+    )
+
+    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=40)
+    metric_specs = get_open_ended_generation_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_qa:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_qa"],
+    )
+
+
+@run_spec_function("infinite_bench_en_sum")
+def get_infinite_bench_en_sum_spec(max_num_words: int = 131072) -> RunSpec:

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.infinite_bench_sum_scenario.InfiniteBenchSumScenario",
+        class_name="helm.benchmark.scenarios.infinite_bench_en_sum_scenario.InfiniteBenchEnSumScenario",
         args={
-            "min_num_words": min_num_words,
             "max_num_words": max_num_words,
         },
     )

-    # No official number for max tokens, the average output token is 1.1k according to the paper
-    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=2000)
-    metric_specs = get_basic_metric_specs(["rouge_l"])
+    adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=1200)
+    metric_specs = get_open_ended_generation_metric_specs()
+
+    return RunSpec(
+        name=f"infinite_bench_en_sum:max_num_words={max_num_words}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["infinite_bench_en_sum"],
+    )
+
+
+@run_spec_function("openai_mrcr")
+def get_openai_mrcr_spec(needles: int, max_num_words: int = 131072) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.openai_mrcr_scenario.OpenAIMRCRScenario",
+        args={"needles": needles, "max_num_words": max_num_words},
+    )
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
+    )
+    metric_specs = get_exact_match_metric_specs() + [
+        MetricSpec(class_name="helm.benchmark.metrics.openai_mrcr_metrics.OpenAIMRCRMetric")
+    ]

     return RunSpec(
-        name=f"infinite_bench_sum:min_num_words={min_num_words},max_num_words={max_num_words}",
+        name=f"openai_mrcr:needles={needles},max_num_words={max_num_words}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=["infinite_bench_sum"],
+        groups=["openai_mrcr"],
     )
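Taken together, the long-context suite now exposes five entry points, all defaulting to a 131,072-word budget. Hypothetical run entries (the needle counts 2, 4, and 8 are assumed from OpenAI's MRCR dataset splits; the model name is a placeholder):

run_entries = [
    "ruler_hotpotqa:max_num_words=131072,model=openai/gpt-4o",
    "ruler_squad:max_num_words=131072,model=openai/gpt-4o",
    "infinite_bench_en_qa:max_num_words=131072,model=openai/gpt-4o",
    "infinite_bench_en_sum:max_num_words=131072,model=openai/gpt-4o",
] + [f"openai_mrcr:needles={n},max_num_words=131072,model=openai/gpt-4o" for n in (2, 4, 8)]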