crfm-helm 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (68)
  1. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +11 -8
  2. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +67 -38
  3. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +2 -1
  5. helm/benchmark/__init__.py +13 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/augmentations/correct_to_misspelling.json +1 -0
  9. helm/benchmark/contamination/__init__.py +0 -0
  10. helm/benchmark/metrics/classification_metrics.py +70 -0
  11. helm/benchmark/metrics/machine_translation_metrics.py +36 -0
  12. helm/benchmark/metrics/summarization_metrics.py +7 -8
  13. helm/benchmark/metrics/test_classification_metrics.py +150 -0
  14. helm/benchmark/presentation/create_plots.py +617 -0
  15. helm/benchmark/presentation/run_display.py +7 -48
  16. helm/benchmark/presentation/summarize.py +4 -2
  17. helm/benchmark/presentation/test_create_plots.py +32 -0
  18. helm/benchmark/run.py +144 -48
  19. helm/benchmark/run_expander.py +164 -47
  20. helm/benchmark/run_specs.py +346 -39
  21. helm/benchmark/runner.py +34 -6
  22. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  23. helm/benchmark/scenarios/covid_dialog_scenario.py +84 -0
  24. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  25. helm/benchmark/scenarios/lex_glue_scenario.py +253 -0
  26. helm/benchmark/scenarios/lextreme_scenario.py +458 -0
  27. helm/benchmark/scenarios/me_q_sum_scenario.py +86 -0
  28. helm/benchmark/scenarios/med_dialog_scenario.py +132 -0
  29. helm/benchmark/scenarios/med_mcqa_scenario.py +102 -0
  30. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +119 -0
  31. helm/benchmark/scenarios/med_qa_scenario.py +96 -0
  32. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  33. helm/benchmark/scenarios/scenario.py +5 -0
  34. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  35. helm/benchmark/scenarios/wmt_14_scenario.py +96 -0
  36. helm/benchmark/static/benchmarking.css +14 -0
  37. helm/benchmark/static/benchmarking.js +43 -0
  38. helm/benchmark/static/index.html +2 -0
  39. helm/benchmark/static/json-urls.js +4 -0
  40. helm/benchmark/static/plot-captions.js +16 -0
  41. helm/benchmark/static/schema.yaml +154 -1
  42. helm/benchmark/window_services/cohere_window_service.py +20 -0
  43. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  44. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  45. helm/benchmark/window_services/santacoder_window_service.py +27 -0
  46. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  47. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  48. helm/benchmark/window_services/window_service_factory.py +34 -7
  49. helm/common/codec.py +123 -0
  50. helm/common/general.py +12 -5
  51. helm/common/test_codec.py +144 -0
  52. helm/proxy/clients/aleph_alpha_client.py +47 -28
  53. helm/proxy/clients/auto_client.py +32 -24
  54. helm/proxy/clients/google_client.py +88 -0
  55. helm/proxy/clients/huggingface_client.py +32 -16
  56. helm/proxy/clients/huggingface_model_registry.py +111 -0
  57. helm/proxy/clients/huggingface_tokenizer.py +25 -7
  58. helm/proxy/clients/openai_client.py +60 -2
  59. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  60. helm/proxy/clients/test_huggingface_tokenizer.py +3 -0
  61. helm/proxy/clients/together_client.py +17 -2
  62. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  63. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  64. helm/proxy/models.py +115 -7
  65. helm/proxy/test_models.py +1 -1
  66. helm/benchmark/presentation/present.py +0 -249
  67. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  68. {crfm_helm-0.2.0.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs.py CHANGED
@@ -14,15 +14,28 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .metrics.metric import MetricSpec
-from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander
+from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander, ChatMLRunExpander
 from .runner import RunSpec
+from .scenarios.lex_glue_scenario import (
+    get_lex_glue_max_train_instances,
+    get_lex_glue_instructions,
+    get_lex_glue_max_tokens,
+    get_lex_glue_task_type,
+)
 from .scenarios.scenario import ScenarioSpec
 from .scenarios.big_bench_scenario import BIGBenchScenario
 from .scenarios.msmarco_scenario import MSMARCOScenario
 from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
 from .scenarios.copyright_scenario import datatag2hash_code
 from .scenarios.raft_scenario import get_raft_instructions
-from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG
+from .scenarios.lextreme_scenario import (
+    get_lextreme_instructions,
+    get_lextreme_max_train_instances,
+    get_lextreme_max_tokens,
+    TaskType,
+    get_lextreme_task_type,
+)
+from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG
 from helm.common.general import singleton
 
 
@@ -37,7 +50,14 @@ def format_instructions(instructions: str) -> str:
 
 
 def get_multiple_choice_joint_adapter_spec(
-    instructions: str, input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, **kwargs
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -54,6 +74,7 @@ def get_multiple_choice_joint_adapter_spec(
     [reference_k]
     [output_noun]:
     """
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
@@ -62,10 +83,11 @@ def get_multiple_choice_joint_adapter_spec(
         output_prefix=f"{output_noun}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
-        num_outputs=1,
-        max_tokens=5,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
         temperature=0.0,
         stop_sequences=["\n"],
+        sample_train=sample_train,
         **kwargs,
     )
 
@@ -99,15 +121,26 @@ def get_multiple_choice_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 5,
     empty_input: bool = False,
+    sample_train: bool = True,
     **kwargs,
 ):
+
     """
     Toggle between joint and separate adapters.
     """
     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
         return get_multiple_choice_joint_adapter_spec(
-            instructions, input_noun, output_noun, max_train_instances, **kwargs
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
         )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -304,6 +337,27 @@ def get_summarization_adapter_spec(num_sents: int, **kwargs) -> AdapterSpec:
     )
 
 
+def get_machine_translation_adapter_spec(
+    source_language, target_language, max_train_instances, **kwargs
+) -> AdapterSpec:
+    """
+    Used for machine translation.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=f"Translate {source_language} to {target_language}:",
+        input_prefix="",
+        input_suffix=" = ",
+        output_prefix="",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["\n\n"],
+        temperature=0.0,
+        **kwargs,
+    )
+
+
 ############################################################
 # Examples of scenario and adapter specs
 
@@ -354,6 +408,14 @@ def get_f1_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])
 
 
+def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
+        )
+    ]
+
+
 def get_bbq_metric_specs() -> List[MetricSpec]:
     return [MetricSpec(class_name="helm.benchmark.bbq_metrics.BBQMetric", args={})] + get_exact_match_metric_specs()
 
@@ -479,6 +541,16 @@ def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
     return [MetricSpec(class_name="helm.benchmark.code_metrics.APPSMetric", args=args)]
 
 
+def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
+    return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
+
+
+def get_machine_translation_metric_specs() -> List[MetricSpec]:
+    return [
+        MetricSpec(class_name="helm.benchmark.machine_translation_metrics.MachineTranslationMetric", args={})
+    ] + get_basic_metric_specs([])
+
+
 ############################################################
 # Run specs
 
@@ -564,7 +636,9 @@ def get_civil_comments_spec(demographic: str) -> RunSpec:
         name=f"civil_comments:demographic={demographic}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
+        metric_specs=get_exact_match_metric_specs()
+        + get_generative_harms_metric_specs()
+        + get_classification_metric_specs(),
         groups=["civil_comments"],
     )
 
@@ -809,7 +883,9 @@ def get_raft_spec(subset: str) -> RunSpec:
         name=f"raft:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs() + get_generative_harms_metric_specs(),
+        metric_specs=get_exact_match_metric_specs()
+        + get_generative_harms_metric_specs()
+        + get_classification_metric_specs(),
         groups=["raft"],
     )
 
@@ -971,7 +1047,7 @@ def get_imdb_spec(only_contrast=False) -> RunSpec:
         name="imdb" + (":only_contrast=True" if only_contrast else ""),
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
         groups=["imdb"],
     )
 
@@ -1182,10 +1258,7 @@ def get_narrativeqa_spec() -> RunSpec:
         name="narrative_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_basic_metric_specs(
-            ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"]
-        )
-        + get_generative_harms_metric_specs(),
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
         groups=["narrative_qa"],
     )
 
@@ -1509,7 +1582,7 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     adapter_spec = AdapterSpec(
         method=get_adaptation_method(big_bench_task["metrics"]),
         model="openai/text-curie-001",  # Can override with the `ModelRunExpander`.
-        max_train_instances=0,  # Can override with the `MaxTrainInstancesRunExpander`.
+        max_train_instances=5,  # Can override with the `MaxTrainInstancesRunExpander`.
         num_outputs=1,  # Can override with the `NumOutputsRunExpander`.
         # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
         # for the BIG-G models tested on BIG-bench, "we use an input context length of 1,024 tokens
@@ -1541,36 +1614,136 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     )
 
 
-def get_pubmed_qa_spec(prompt_answer_choices: str) -> RunSpec:
+def get_covid_dialog_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.covid_dialog_scenario.COVIDDialogScenario", args={}
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Generate a response given a patient's questions and concerns.",
+        input_noun="Patient",
+        output_noun="Doctor",
+        max_tokens=128,
+    )
+
+    return RunSpec(
+        name="covid_dialog",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["COVIDDialog"],
+    )
+
+
+def get_me_q_sum_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.me_q_sum_scenario.MeQSumScenario", args={})
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=1,
+        max_tokens=128,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name="me_q_sum",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MeQSum"],
+    )
+
+
+def get_med_dialog_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
+    )
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=1,
+        max_tokens=128,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name=f"med_dialog,subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MedDialog"],
+    )
+
+
+def get_med_mcqa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="med_mcqa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["MedMCQA"],
+    )
+
+
+def get_med_paragraph_simplification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.med_paragraph_simplification_scenario.MedParagraphSimplificationScenario",
+        args={},
+    )
+
+    adapter_spec = get_summarization_adapter_spec(
+        num_sents=10,
+        max_tokens=512,
+        temperature=0.3,
+    )
+
+    return RunSpec(
+        name="med_paragraph_simplification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
+        groups=["MedParagraphSimplification"],
+    )
+
+
+def get_med_qa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C or D.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name="med_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["MedQA"],
+    )
+
+
+def get_pubmed_qa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
 
-    # We are trying to reproduce the Instruct-GPT3's zero-shot performance of 73.2% from
-    # "Can large language models reason about medical questions?" (Liévin et al.).
-    # Therefore, specify the values of the fields of `AdapterSpec` based on experiment details of the paper.
-    # Set `output_prefix` based on Table 1 (titled "Prompt templates") of the paper.
-    output_prefix: str = "Answer: "
-    if prompt_answer_choices.lower() == "true":
-        output_prefix += "among A through C, the answer is "
-
-    # Liévin et al. followed what Kojima et al. did in "Large Language Models are Zero-Shot Reasoners."
-    # to extract answers from completions: set the max completion length to a large number and
-    # "...pick up the first large letter encountered in the text." Then they set "'Q:'...as a customized stop
-    # sequence for all the models except for Instruct-GPT3 to stop the models from repeating questions and
-    # answers by themselves." We don't need to do this since our framework has a "multiple_choice_joint"
-    # adaptation method that handles the prompt construction for multiple-choice QA for us.
-    adapter_spec = AdapterSpec(
+    adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        max_train_instances=0,  # We want to reproduce the zero-shot performance.
-        # "We sampled one completion per prompt with a temperature of zero..."
-        num_outputs=1,
-        temperature=0,
-        input_prefix="",
-        output_prefix=output_prefix,
-        # Following the examples in https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html
-        reference_prefix="A) ",
+        instructions="Answer A for yes, B for no or C for maybe.",
+        input_noun="Question",
+        output_noun="Answer",
     )
+
     return RunSpec(
-        name=f"pubmed_qa:prompt_answer_choices={prompt_answer_choices}",
+        name="pubmed_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
@@ -1578,6 +1751,125 @@ def get_pubmed_qa_spec(prompt_answer_choices: str) -> RunSpec:
     )
 
 
+def build_classification_metrics(task_type):
+    if task_type in [TaskType.QA, TaskType.SLTC]:
+        return get_classification_metric_specs(delimiter=None)
+    elif task_type == TaskType.MLTC:
+        return get_classification_metric_specs(delimiter=",")
+    return []
+
+
+def get_lextreme_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
+        args={"subset": subset},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_lextreme_instructions(subset),
+        input_noun="Passage",
+        output_noun="Answer",
+        max_tokens=get_lextreme_max_tokens(subset),
+        max_train_instances=get_lextreme_max_train_instances(subset),  # in some subsets the input is very long
+    )
+
+    return RunSpec(
+        name=f"lextreme:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=build_classification_metrics(get_lextreme_task_type(subset)),
+        groups=["lextreme"],
+    )
+
+
+def get_lex_glue_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.lex_glue_scenario.LexGLUEScenario",
+        args={"subset": subset},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=get_lex_glue_instructions(subset),
+        input_noun="Passage",
+        output_noun="Answer",
+        max_tokens=get_lex_glue_max_tokens(subset),
+        max_train_instances=get_lex_glue_max_train_instances(subset),  # in some subsets the input is very long
+    )
+
+    return RunSpec(
+        name=f"lex_glue:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=build_classification_metrics(get_lex_glue_task_type(subset)),
+        groups=["lex_glue"],
+    )
+
+
+def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
+    FULL_LANGUAGE_NAMES = {
+        "cs": "Czech",
+        "de": "German",
+        "fr": "French",
+        "hi": "Hindi",
+        "ru": "Russian",
+        "en": "English",
+    }
+    source_language, target_language = language_pair.split("-")
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario",
+        args={"source_language": source_language, "target_language": target_language},
+    )
+
+    adapter_spec = get_machine_translation_adapter_spec(
+        source_language=FULL_LANGUAGE_NAMES[source_language],
+        target_language=FULL_LANGUAGE_NAMES[target_language],
+        max_train_instances=max_train_instances,
+    )
+
+    return RunSpec(
+        name=f"wmt_14:language_pair={language_pair}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_machine_translation_metric_specs(),
+        groups=["wmt_14"],
+    )
+
+
+def get_opinions_qa_spec(
+    survey_type: str,
+    num_logprobs: str,
+    context: str = "None",
+    num_train_trials: str = "1",
+    method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
+) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
+        args={"survey_type": survey_type, "context": context},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=1 if "steer" in context else 0,
+        max_tokens=1,
+        num_outputs=int(num_logprobs),
+        num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
+        sample_train=False,
+    )
+
+    return RunSpec(
+        name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+        + f",context={context},num_train_trials={num_train_trials}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=[],
+        groups=["opinions_qa"],
+    )
+
+
 ############################################################
 
 CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
@@ -1624,7 +1916,18 @@ CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
     "entity_data_imputation": get_entity_data_imputation_spec,
     "ice": get_ice_spec,
     "big_bench": get_big_bench_spec,
+    "lextreme": get_lextreme_spec,
+    "lex_glue": get_lex_glue_spec,
+    "wmt_14": get_wmt_14_spec,
+    # Biomedical
+    "covid_dialog": get_covid_dialog_spec,
+    "me_q_sum": get_me_q_sum_spec,
+    "med_dialog": get_med_dialog_spec,
+    "med_mcqa": get_med_mcqa_spec,
+    "med_paragraph_simplification": get_med_paragraph_simplification_spec,
+    "med_qa": get_med_qa_spec,
     "pubmed_qa": get_pubmed_qa_spec,
+    "opinions_qa": get_opinions_qa_spec,
 }
 
 
@@ -1667,6 +1970,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
             global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
             run_spec = singleton(global_prefix_expander.expand(run_spec))
 
+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
         return run_spec
 
     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
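
Illustrative usage (not part of the diff): a minimal sketch of how two of the run-spec factories added above can be exercised, assuming crfm-helm 0.2.2 is installed. The chosen language pair is just an example; the printed names follow the formats visible in the hunks.

from helm.benchmark.run_specs import get_med_qa_spec, get_wmt_14_spec

med_qa = get_med_qa_spec()
print(med_qa.name)    # "med_qa", grouped under "MedQA"

wmt = get_wmt_14_spec(language_pair="de-en", max_train_instances=1)
print(wmt.name)       # "wmt_14:language_pair=de-en"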
helm/benchmark/runner.py CHANGED
@@ -1,10 +1,13 @@
 import json
 import os
+import traceback
 import typing
 from collections import Counter
 from dataclasses import dataclass, field
 from typing import List
 
+from tqdm import tqdm
+
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
@@ -23,6 +26,12 @@ from .metrics.tokens_metric import TokensMetric
 from .window_services.tokenizer_service import TokenizerService
 
 
+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass
+
+
 @dataclass(frozen=True)
 class RunSpec:
     """
@@ -68,15 +77,17 @@ class Runner:
         execution_spec: ExecutionSpec,
         output_path: str,
         suite: str,
-        run_specs: List[RunSpec],
         skip_instances: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
-        self.run_specs: List[RunSpec] = run_specs
         self.skip_instances: bool = skip_instances
+        self.skip_completed_runs: bool = skip_completed_runs
+        self.exit_on_error: bool = exit_on_error
 
         ensure_directory_exists(output_path)
         # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
@@ -90,10 +101,21 @@ class Runner:
         self.eval_cache_path: str = os.path.join(self.runs_path, "eval_cache")
         ensure_directory_exists(self.eval_cache_path)
 
-    def run_all(self):
-        for run_spec in self.run_specs:
-            with htrack_block(f"Running {run_spec.name}"):
-                self.run_one(run_spec)
+    def run_all(self, run_specs: List[RunSpec]):
+        failed_run_specs: List[RunSpec] = []
+        for run_spec in tqdm(run_specs, disable=None):
+            try:
+                with htrack_block(f"Running {run_spec.name}"):
+                    self.run_one(run_spec)
+            except Exception as e:
+                if self.exit_on_error:
+                    raise e
+                else:
+                    hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+                    failed_run_specs.append(run_spec)
+        if not self.exit_on_error and failed_run_specs:
+            failed_runs_str = ", ".join([f'"{run_spec.name}"' for run_spec in failed_run_specs])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")
 
     def run_one(self, run_spec: RunSpec):
         # Load the scenario
@@ -106,6 +128,12 @@ class Runner:
         run_path: str = os.path.join(self.runs_path, run_spec.name)
         ensure_directory_exists(run_path)
 
+        if self.skip_completed_runs and os.path.exists(os.path.join(run_path, "scenario_state.json")):
+            # If scenario_state.json exists, assume that all other output files exist
+            # because scenario_state.json is the last output file to be written.
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+
         # Fetch and initialize the Adapter based on the `AdapterSpec`.
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
 
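Illustrative usage (not part of the diff): a minimal sketch of how the reworked Runner interface fits together. The output path, suite name, and the ExecutionSpec import path are assumptions based on the 0.2.x layout; only the constructor arguments, the run_all signature, and the RunnerError behavior come from the hunks above.

from typing import List

from helm.benchmark.executor import ExecutionSpec  # assumed module path
from helm.benchmark.runner import Runner, RunnerError, RunSpec


def run_benchmarks(execution_spec: ExecutionSpec, run_specs: List[RunSpec]) -> None:
    runner = Runner(
        execution_spec=execution_spec,
        output_path="benchmark_output",  # placeholder output directory
        suite="my-suite",                # placeholder suite name
        skip_instances=False,
        skip_completed_runs=True,  # new: skip runs whose scenario_state.json already exists
        exit_on_error=False,       # new: log failures and keep going instead of stopping
    )
    try:
        runner.run_all(run_specs)  # run specs are now passed here rather than to __init__
    except RunnerError as e:
        print(e)  # raised once at the end, naming every failed run spec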
helm/benchmark/scenarios/copyright_scenario.py CHANGED
@@ -72,7 +72,7 @@ class CopyrightScenario(Scenario):
 
         # Read all the instances
         instances: List[Instance] = []
-        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances"):
+        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances", disable=None):
             instances.append(
                 Instance(
                     input=Input(text=prefix),
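
Context for the one-line change above: tqdm's disable=None shows the progress bar only when the output stream is attached to a TTY, so redirected batch logs are not flooded. A standalone illustration:

from tqdm import tqdm

# With disable=None, the bar appears in an interactive terminal but is
# suppressed when stderr is redirected to a file (tqdm's documented non-TTY behavior).
for _ in tqdm(range(3), desc="load instances", disable=None):
    pass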