crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
  2. crfm_helm-0.5.0.dist-info/RECORD +642 -0
  3. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +37 -2
  5. helm/benchmark/adaptation/adapters/adapter.py +4 -42
  6. helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
  7. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
  8. helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
  9. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
  10. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
  11. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
  12. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  13. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
  14. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
  15. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
  16. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
  17. helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
  18. helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
  19. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
  20. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
  21. helm/benchmark/adaptation/common_adapter_specs.py +376 -0
  22. helm/benchmark/adaptation/prompt.py +7 -1
  23. helm/benchmark/adaptation/request_state.py +6 -1
  24. helm/benchmark/adaptation/scenario_state.py +6 -2
  25. helm/benchmark/annotation/annotator.py +43 -0
  26. helm/benchmark/annotation/annotator_factory.py +61 -0
  27. helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
  28. helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
  29. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
  30. helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
  31. helm/benchmark/annotation/test_annotator_factory.py +26 -0
  32. helm/benchmark/annotation/test_dummy_annotator.py +44 -0
  33. helm/benchmark/annotation_executor.py +124 -0
  34. helm/benchmark/augmentations/cleva_perturbation.py +7 -14
  35. helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
  36. helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
  37. helm/benchmark/augmentations/data_augmenter.py +0 -2
  38. helm/benchmark/augmentations/dialect_perturbation.py +2 -2
  39. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  40. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  41. helm/benchmark/augmentations/gender_perturbation.py +3 -3
  42. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  43. helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
  44. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  45. helm/benchmark/augmentations/person_name_perturbation.py +0 -7
  46. helm/benchmark/augmentations/perturbation.py +20 -7
  47. helm/benchmark/augmentations/perturbation_description.py +1 -1
  48. helm/benchmark/augmentations/space_perturbation.py +2 -2
  49. helm/benchmark/augmentations/suffix_perturbation.py +29 -0
  50. helm/benchmark/augmentations/synonym_perturbation.py +2 -2
  51. helm/benchmark/augmentations/test_perturbation.py +11 -7
  52. helm/benchmark/augmentations/translate_perturbation.py +30 -0
  53. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  54. helm/benchmark/config_registry.py +38 -0
  55. helm/benchmark/executor.py +46 -16
  56. helm/benchmark/huggingface_registration.py +37 -7
  57. helm/benchmark/metrics/basic_metrics.py +172 -641
  58. helm/benchmark/metrics/bbq_metrics.py +3 -4
  59. helm/benchmark/metrics/bias_metrics.py +6 -6
  60. helm/benchmark/metrics/classification_metrics.py +11 -8
  61. helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
  62. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  63. helm/benchmark/metrics/code_metrics.py +4 -3
  64. helm/benchmark/metrics/code_metrics_helper.py +0 -2
  65. helm/benchmark/metrics/common_metric_specs.py +167 -0
  66. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
  67. helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
  68. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
  69. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
  70. helm/benchmark/metrics/disinformation_metrics.py +6 -112
  71. helm/benchmark/metrics/dry_run_metrics.py +5 -3
  72. helm/benchmark/metrics/efficiency_metrics.py +206 -0
  73. helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
  74. helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
  75. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
  76. helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
  77. helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
  78. helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
  79. helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
  80. helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
  81. helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
  82. helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
  83. helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
  84. helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
  85. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
  86. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
  87. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
  88. helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
  89. helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
  90. helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
  91. helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
  92. helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
  93. helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
  94. helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
  95. helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
  96. helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
  97. helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
  98. helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
  99. helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
  100. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
  101. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
  102. helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
  103. helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
  104. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
  105. helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
  106. helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
  107. helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
  108. helm/benchmark/metrics/language_modeling_metrics.py +99 -0
  109. helm/benchmark/metrics/machine_translation_metrics.py +5 -5
  110. helm/benchmark/metrics/metric.py +93 -172
  111. helm/benchmark/metrics/metric_name.py +0 -1
  112. helm/benchmark/metrics/metric_service.py +16 -0
  113. helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
  114. helm/benchmark/metrics/ranking_metrics.py +6 -7
  115. helm/benchmark/metrics/reference_metric.py +148 -0
  116. helm/benchmark/metrics/summac/model_summac.py +0 -2
  117. helm/benchmark/metrics/summarization_metrics.py +8 -8
  118. helm/benchmark/metrics/test_classification_metrics.py +9 -6
  119. helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
  120. helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
  121. helm/benchmark/metrics/test_metric.py +2 -2
  122. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
  123. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
  124. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  125. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
  126. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
  127. helm/benchmark/metrics/toxicity_metrics.py +1 -1
  128. helm/benchmark/metrics/toxicity_utils.py +23 -0
  129. helm/benchmark/metrics/unitxt_metrics.py +81 -0
  130. helm/benchmark/metrics/vision_language/__init__.py +0 -0
  131. helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
  132. helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
  133. helm/benchmark/metrics/vision_language/image_utils.py +100 -0
  134. helm/benchmark/model_deployment_registry.py +164 -41
  135. helm/benchmark/model_metadata_registry.py +181 -35
  136. helm/benchmark/multi_gpu_runner.py +133 -0
  137. helm/benchmark/presentation/contamination.py +3 -3
  138. helm/benchmark/presentation/create_plots.py +8 -7
  139. helm/benchmark/presentation/run_display.py +50 -17
  140. helm/benchmark/presentation/schema.py +28 -46
  141. helm/benchmark/presentation/summarize.py +213 -96
  142. helm/benchmark/presentation/table.py +8 -8
  143. helm/benchmark/presentation/test_contamination.py +2 -2
  144. helm/benchmark/presentation/test_run_entry.py +14 -9
  145. helm/benchmark/presentation/test_summarize.py +5 -0
  146. helm/benchmark/run.py +66 -54
  147. helm/benchmark/run_expander.py +342 -31
  148. helm/benchmark/run_spec.py +93 -0
  149. helm/benchmark/run_spec_factory.py +162 -0
  150. helm/benchmark/run_specs/__init__.py +0 -0
  151. helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
  152. helm/benchmark/run_specs/cleva_run_specs.py +277 -0
  153. helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
  154. helm/benchmark/run_specs/heim_run_specs.py +623 -0
  155. helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
  156. helm/benchmark/run_specs/lite_run_specs.py +307 -0
  157. helm/benchmark/run_specs/simple_run_specs.py +104 -0
  158. helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
  159. helm/benchmark/run_specs/vlm_run_specs.py +501 -0
  160. helm/benchmark/runner.py +116 -69
  161. helm/benchmark/runner_config_registry.py +21 -0
  162. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  163. helm/benchmark/scenarios/bold_scenario.py +2 -2
  164. helm/benchmark/scenarios/cleva_scenario.py +43 -46
  165. helm/benchmark/scenarios/code_scenario.py +3 -2
  166. helm/benchmark/scenarios/commonsense_scenario.py +171 -191
  167. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
  168. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
  169. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
  170. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
  171. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
  172. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
  173. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
  174. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
  175. helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
  176. helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
  177. helm/benchmark/scenarios/image_generation/__init__.py +0 -0
  178. helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
  179. helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
  180. helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
  181. helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
  182. helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
  183. helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
  184. helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
  185. helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
  186. helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
  187. helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
  188. helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
  189. helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
  190. helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
  191. helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
  192. helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
  193. helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
  194. helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
  195. helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
  196. helm/benchmark/scenarios/imdb_scenario.py +0 -1
  197. helm/benchmark/scenarios/legalbench_scenario.py +123 -0
  198. helm/benchmark/scenarios/live_qa_scenario.py +94 -0
  199. helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
  200. helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
  201. helm/benchmark/scenarios/math_scenario.py +19 -2
  202. helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
  203. helm/benchmark/scenarios/numeracy_scenario.py +3 -3
  204. helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
  205. helm/benchmark/scenarios/raft_scenario.py +2 -6
  206. helm/benchmark/scenarios/scenario.py +14 -2
  207. helm/benchmark/scenarios/simple_scenarios.py +122 -1
  208. helm/benchmark/scenarios/test_math_scenario.py +22 -0
  209. helm/benchmark/scenarios/test_scenario.py +6 -3
  210. helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
  211. helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
  212. helm/benchmark/scenarios/the_pile_scenario.py +6 -7
  213. helm/benchmark/scenarios/unitxt_scenario.py +56 -0
  214. helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
  215. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  216. helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
  217. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
  218. helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
  219. helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
  220. helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
  221. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
  222. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
  223. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
  224. helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
  225. helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
  226. helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
  227. helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
  228. helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
  229. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
  230. helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
  231. helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
  232. helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
  233. helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
  234. helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
  235. helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
  236. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
  237. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
  238. helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
  239. helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
  240. helm/benchmark/server.py +59 -2
  241. helm/benchmark/slurm_jobs.py +12 -0
  242. helm/benchmark/slurm_runner.py +79 -51
  243. helm/benchmark/static/benchmarking.js +3 -4
  244. helm/benchmark/static/contamination.yaml +1 -1
  245. helm/benchmark/static/images/organizations/together.png +0 -0
  246. helm/benchmark/static/json-urls.js +4 -0
  247. helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
  248. helm/benchmark/static/schema_instruction_following.yaml +210 -0
  249. helm/benchmark/static/schema_lite.yaml +824 -0
  250. helm/benchmark/static/schema_mmlu.yaml +1507 -0
  251. helm/benchmark/static/schema_unitxt.yaml +428 -0
  252. helm/benchmark/static/schema_vlm.yaml +576 -0
  253. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  254. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  255. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  256. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  257. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  258. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  259. helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
  260. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  261. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  262. helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
  263. helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
  264. helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
  265. helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
  266. helm/benchmark/static_build/assets/index-d839df55.js +9 -0
  267. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  268. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  269. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  270. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  271. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  272. helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
  273. helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
  274. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  275. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  276. helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
  277. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  278. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  279. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  280. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  281. helm/benchmark/static_build/config.js +4 -0
  282. helm/benchmark/static_build/index.html +20 -0
  283. helm/benchmark/test_data_preprocessor.py +3 -3
  284. helm/benchmark/test_model_deployment_definition.py +90 -0
  285. helm/benchmark/test_run_expander.py +1 -1
  286. helm/benchmark/tokenizer_config_registry.py +10 -14
  287. helm/benchmark/window_services/ai21_window_service.py +22 -33
  288. helm/benchmark/window_services/cohere_window_service.py +1 -63
  289. helm/benchmark/window_services/default_window_service.py +2 -35
  290. helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
  291. helm/benchmark/window_services/ice_window_service.py +0 -34
  292. helm/benchmark/window_services/image_generation/__init__.py +0 -0
  293. helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
  294. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
  295. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
  296. helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
  297. helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
  298. helm/benchmark/window_services/local_window_service.py +21 -4
  299. helm/benchmark/window_services/no_decoding_window_service.py +32 -0
  300. helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
  301. helm/benchmark/window_services/test_bloom_window_service.py +2 -1
  302. helm/benchmark/window_services/test_cohere_window_service.py +2 -1
  303. helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
  304. helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
  305. helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
  306. helm/benchmark/window_services/test_gptj_window_service.py +3 -2
  307. helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
  308. helm/benchmark/window_services/test_ice_window_service.py +2 -1
  309. helm/benchmark/window_services/test_openai_window_service.py +2 -1
  310. helm/benchmark/window_services/test_opt_window_service.py +3 -2
  311. helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
  312. helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
  313. helm/benchmark/window_services/test_t511b_window_service.py +2 -1
  314. helm/benchmark/window_services/test_ul2_window_service.py +2 -1
  315. helm/benchmark/window_services/test_utils.py +3 -2
  316. helm/benchmark/window_services/test_yalm_window_service.py +2 -1
  317. helm/benchmark/window_services/window_service.py +42 -0
  318. helm/benchmark/window_services/window_service_factory.py +24 -269
  319. helm/benchmark/window_services/yalm_window_service.py +0 -27
  320. helm/clients/__init__.py +0 -0
  321. helm/{proxy/clients → clients}/ai21_client.py +5 -12
  322. helm/clients/aleph_alpha_client.py +112 -0
  323. helm/{proxy/clients → clients}/anthropic_client.py +213 -24
  324. helm/clients/auto_client.py +215 -0
  325. helm/clients/bedrock_client.py +128 -0
  326. helm/clients/bedrock_utils.py +72 -0
  327. helm/{proxy/clients → clients}/client.py +67 -55
  328. helm/clients/clip_score_client.py +49 -0
  329. helm/clients/clip_scorers/__init__.py +0 -0
  330. helm/clients/clip_scorers/base_clip_scorer.py +18 -0
  331. helm/clients/clip_scorers/clip_scorer.py +50 -0
  332. helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
  333. helm/{proxy/clients → clients}/cohere_client.py +6 -17
  334. helm/clients/gcs_client.py +82 -0
  335. helm/{proxy/clients → clients}/google_client.py +7 -8
  336. helm/clients/google_translate_client.py +35 -0
  337. helm/{proxy/clients → clients}/http_model_client.py +6 -10
  338. helm/{proxy/clients → clients}/huggingface_client.py +134 -92
  339. helm/clients/image_generation/__init__.py +0 -0
  340. helm/clients/image_generation/adobe_vision_client.py +78 -0
  341. helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
  342. helm/clients/image_generation/cogview2/__init__.py +0 -0
  343. helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
  344. helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
  345. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
  346. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
  347. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
  348. helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
  349. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
  350. helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
  351. helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
  352. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
  353. helm/clients/image_generation/cogview2_client.py +191 -0
  354. helm/clients/image_generation/dalle2_client.py +192 -0
  355. helm/clients/image_generation/dalle3_client.py +108 -0
  356. helm/clients/image_generation/dalle_mini/__init__.py +3 -0
  357. helm/clients/image_generation/dalle_mini/data.py +442 -0
  358. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
  359. helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
  360. helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
  361. helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
  362. helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
  363. helm/clients/image_generation/dalle_mini/model/text.py +251 -0
  364. helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
  365. helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
  366. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
  367. helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
  368. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
  369. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
  370. helm/clients/image_generation/dalle_mini_client.py +190 -0
  371. helm/clients/image_generation/deep_floyd_client.py +78 -0
  372. helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
  373. helm/clients/image_generation/image_generation_client_utils.py +9 -0
  374. helm/clients/image_generation/lexica_client.py +86 -0
  375. helm/clients/image_generation/mindalle/__init__.py +0 -0
  376. helm/clients/image_generation/mindalle/models/__init__.py +216 -0
  377. helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
  378. helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
  379. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
  380. helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
  381. helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
  382. helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
  383. helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
  384. helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
  385. helm/clients/image_generation/mindalle/utils/config.py +129 -0
  386. helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
  387. helm/clients/image_generation/mindalle/utils/utils.py +89 -0
  388. helm/clients/image_generation/mindalle_client.py +115 -0
  389. helm/clients/image_generation/nudity_check_client.py +64 -0
  390. helm/clients/image_generation/together_image_generation_client.py +111 -0
  391. helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
  392. helm/{proxy/clients → clients}/megatron_client.py +13 -7
  393. helm/clients/mistral_client.py +134 -0
  394. helm/clients/moderation_api_client.py +109 -0
  395. helm/clients/open_lm_client.py +43 -0
  396. helm/clients/openai_client.py +302 -0
  397. helm/{proxy/clients → clients}/palmyra_client.py +15 -12
  398. helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
  399. helm/clients/simple_client.py +64 -0
  400. helm/{proxy/clients → clients}/test_auto_client.py +15 -15
  401. helm/clients/test_client.py +100 -0
  402. helm/clients/test_huggingface_client.py +70 -0
  403. helm/clients/test_simple_client.py +19 -0
  404. helm/{proxy/clients → clients}/test_together_client.py +23 -12
  405. helm/{proxy/clients → clients}/together_client.py +18 -71
  406. helm/clients/vertexai_client.py +391 -0
  407. helm/clients/vision_language/__init__.py +0 -0
  408. helm/clients/vision_language/huggingface_vlm_client.py +104 -0
  409. helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
  410. helm/clients/vision_language/open_flamingo/__init__.py +2 -0
  411. helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
  412. helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
  413. helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
  414. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
  415. helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
  416. helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
  417. helm/clients/vision_language/open_flamingo_client.py +155 -0
  418. helm/clients/vision_language/qwen_vlm_client.py +171 -0
  419. helm/clients/vllm_client.py +46 -0
  420. helm/common/cache.py +24 -179
  421. helm/common/cache_backend_config.py +47 -0
  422. helm/common/clip_score_request.py +41 -0
  423. helm/common/concurrency.py +32 -0
  424. helm/common/credentials_utils.py +28 -0
  425. helm/common/file_caches/__init__.py +0 -0
  426. helm/common/file_caches/file_cache.py +16 -0
  427. helm/common/file_caches/local_file_cache.py +61 -0
  428. helm/common/file_caches/test_local_file_cache.py +25 -0
  429. helm/common/file_upload_request.py +27 -0
  430. helm/common/general.py +29 -10
  431. helm/common/image_generation_parameters.py +25 -0
  432. helm/common/images_utils.py +24 -1
  433. helm/common/key_value_store.py +113 -0
  434. helm/common/media_object.py +13 -0
  435. helm/common/moderations_api_request.py +71 -0
  436. helm/common/mongo_key_value_store.py +88 -0
  437. helm/common/multimodal_request_utils.py +31 -0
  438. helm/common/nudity_check_request.py +29 -0
  439. helm/common/object_spec.py +2 -2
  440. helm/common/request.py +36 -27
  441. helm/common/test_general.py +6 -0
  442. helm/common/tokenization_request.py +6 -3
  443. helm/config/__init__.py +0 -0
  444. helm/config/model_deployments.yaml +1942 -0
  445. helm/config/model_metadata.yaml +2201 -0
  446. helm/config/tokenizer_configs.yaml +362 -0
  447. helm/proxy/accounts.py +31 -4
  448. helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
  449. helm/proxy/critique/model_critique_client.py +13 -5
  450. helm/proxy/example_queries.py +29 -17
  451. helm/proxy/retry.py +8 -2
  452. helm/proxy/server.py +77 -5
  453. helm/proxy/services/remote_service.py +31 -0
  454. helm/proxy/services/server_service.py +103 -20
  455. helm/proxy/services/service.py +34 -2
  456. helm/proxy/services/test_remote_service.py +7 -6
  457. helm/proxy/services/test_service.py +27 -18
  458. helm/proxy/test_accounts.py +32 -0
  459. helm/proxy/token_counters/auto_token_counter.py +37 -37
  460. helm/proxy/token_counters/test_auto_token_counter.py +164 -0
  461. helm/proxy/token_counters/token_counter.py +3 -5
  462. helm/py.typed +0 -0
  463. helm/tokenizers/__init__.py +0 -0
  464. helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
  465. helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
  466. helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
  467. helm/tokenizers/auto_tokenizer.py +93 -0
  468. helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
  469. helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
  470. helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
  471. helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
  472. helm/tokenizers/simple_tokenizer.py +33 -0
  473. helm/tokenizers/test_anthropic_tokenizer.py +82 -0
  474. helm/tokenizers/test_huggingface_tokenizer.py +136 -0
  475. helm/tokenizers/test_simple_tokenizer.py +33 -0
  476. helm/tokenizers/vertexai_tokenizer.py +97 -0
  477. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
  478. helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
  479. helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
  480. helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
  481. crfm_helm-0.3.0.dist-info/RECORD +0 -396
  482. helm/benchmark/vlm_run_specs.py +0 -71
  483. helm/benchmark/window_services/anthropic_window_service.py +0 -68
  484. helm/benchmark/window_services/bloom_window_service.py +0 -35
  485. helm/benchmark/window_services/flan_t5_window_service.py +0 -29
  486. helm/benchmark/window_services/gpt2_window_service.py +0 -32
  487. helm/benchmark/window_services/gptj_window_service.py +0 -38
  488. helm/benchmark/window_services/gptneox_window_service.py +0 -41
  489. helm/benchmark/window_services/http_model_window_service.py +0 -28
  490. helm/benchmark/window_services/huggingface_window_service.py +0 -59
  491. helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
  492. helm/benchmark/window_services/llama_window_service.py +0 -28
  493. helm/benchmark/window_services/luminous_window_service.py +0 -67
  494. helm/benchmark/window_services/megatron_window_service.py +0 -10
  495. helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
  496. helm/benchmark/window_services/openai_window_service.py +0 -13
  497. helm/benchmark/window_services/opt_window_service.py +0 -35
  498. helm/benchmark/window_services/palmyra_window_service.py +0 -45
  499. helm/benchmark/window_services/remote_window_service.py +0 -48
  500. helm/benchmark/window_services/santacoder_window_service.py +0 -27
  501. helm/benchmark/window_services/starcoder_window_service.py +0 -27
  502. helm/benchmark/window_services/t0pp_window_service.py +0 -35
  503. helm/benchmark/window_services/t511b_window_service.py +0 -30
  504. helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
  505. helm/benchmark/window_services/ul2_window_service.py +0 -30
  506. helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
  507. helm/benchmark/window_services/wider_openai_window_service.py +0 -52
  508. helm/proxy/clients/aleph_alpha_client.py +0 -99
  509. helm/proxy/clients/auto_client.py +0 -461
  510. helm/proxy/clients/goose_ai_client.py +0 -100
  511. helm/proxy/clients/microsoft_client.py +0 -182
  512. helm/proxy/clients/openai_client.py +0 -206
  513. helm/proxy/clients/remote_model_registry.py +0 -28
  514. helm/proxy/clients/simple_client.py +0 -61
  515. helm/proxy/clients/test_anthropic_client.py +0 -63
  516. helm/proxy/clients/test_client.py +0 -31
  517. helm/proxy/clients/test_huggingface_client.py +0 -87
  518. helm/proxy/models.py +0 -963
  519. helm/proxy/test_models.py +0 -27
  520. helm/proxy/token_counters/ai21_token_counter.py +0 -20
  521. helm/proxy/token_counters/cohere_token_counter.py +0 -13
  522. helm/proxy/token_counters/free_token_counter.py +0 -12
  523. helm/proxy/token_counters/gooseai_token_counter.py +0 -24
  524. helm/proxy/token_counters/openai_token_counter.py +0 -22
  525. helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
  526. helm/proxy/token_counters/test_openai_token_counter.py +0 -79
  527. helm/proxy/tokenizers/simple_tokenizer.py +0 -32
  528. helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
  529. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
  530. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
  531. {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
  532. /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
  533. /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
  534. /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
  535. /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
  536. /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
  537. /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
  538. /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
  539. /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
  540. /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
  541. /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
  542. /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
  543. /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
  544. /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
  545. /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
  546. /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py}
@@ -1,798 +1,51 @@
- import importlib
- import itertools
- from functools import partial
- from typing import Any, Callable, List, Dict, Optional, Set, TypeVar
+ """Run spec functions for the HELM Classic leaderboard.

- from helm.common.hierarchical_logger import hlog, htrack
- from helm.common.object_spec import ObjectSpec
- from helm.benchmark.adaptation.adapters.adapter_factory import (
-     ADAPT_LANGUAGE_MODELING,
+ Website: https://crfm.stanford.edu/helm/classic/
+
+ If a run spec function is included in both the HELM Classic leaderboard and the
+ HELM Lite leaderboard, it will be included in the lite_run_specs module instead of this module.
+ This module also contains some scenarios that are currently not used on any HELM leaderboard."""
+
+ from typing import Any, Dict, List, Optional, Set
+
+ from helm.benchmark.adaptation.adapter_spec import (
+     ADAPT_GENERATION,
      ADAPT_MULTIPLE_CHOICE_JOINT,
      ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
-     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
-     ADAPT_GENERATION,
      ADAPT_RANKING_BINARY,
+     AdapterSpec,
  )
  from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
- from helm.benchmark.adaptation.adapter_spec import AdapterSpec
- from helm.common.optional_dependencies import handle_module_not_found_error
- from .metrics.metric import MetricSpec
- from .run_expander import (
-     RUN_EXPANDERS,
-     RunExpander,
-     GlobalPrefixRunExpander,
-     StopRunExpander,
-     ChatMLRunExpander,
-     AddToStopRunExpander,
-     IncreaseMaxTokensRunExpander,
-     FormatPromptRunExpander,
-     IncreaseTemperatureRunExpander,
- )
- from .runner import RunSpec
- from .scenarios.lex_glue_scenario import (
-     get_lex_glue_max_train_instances,
-     get_lex_glue_instructions,
-     get_lex_glue_max_tokens,
-     get_lex_glue_task_type,
+ from helm.benchmark.adaptation.common_adapter_specs import (
+     get_completion_adapter_spec,
+     get_generation_adapter_spec,
+     get_language_modeling_adapter_spec,
+     get_multiple_choice_adapter_spec,
+     get_ranking_binary_adapter_spec,
+     get_summarization_adapter_spec,
  )
- from .scenarios.scenario import ScenarioSpec
- from .scenarios.big_bench_scenario import BIGBenchScenario
- from .scenarios.msmarco_scenario import MSMARCOScenario
- from .scenarios.copyright_scenario import datatag2hash_code
- from .scenarios.raft_scenario import get_raft_instructions
- from .scenarios.lextreme_scenario import (
-     get_lextreme_instructions,
-     get_lextreme_max_train_instances,
-     get_lextreme_max_tokens,
-     TaskType,
-     get_lextreme_task_type,
+ from helm.benchmark.metrics.common_metric_specs import (
+     get_basic_metric_specs,
+     get_bias_metric_specs,
+     get_classification_metric_specs,
+     get_copyright_metric_specs,
+     get_disinformation_metric_specs,
+     get_exact_match_metric_specs,
+     get_f1_metric_specs,
+     get_generative_harms_metric_specs,
+     get_language_modeling_metric_specs,
+     get_numeracy_metric_specs,
+     get_open_ended_generation_metric_specs,
+     get_summarization_metric_specs,
+     get_basic_generation_metric_specs,
+     get_basic_reference_metric_specs,
+     get_generic_metric_specs,
  )
- from helm.proxy.models import (
-     ANTHROPIC_CLAUDE_1_MODEL_TAG,
-     ANTHROPIC_CLAUDE_2_MODEL_TAG,
-     get_model,
-     NO_NEWLINES_TAG,
-     NLG_PREFIX_TAG,
-     CHATML_MODEL_TAG,
-     OPENAI_CHATGPT_MODEL_TAG,
-     BUGGY_TEMP_0_TAG,
- )
- from helm.common.general import singleton
-
-
- ############################################################
- # Prototypical adapter specs
-
-
- def format_instructions(instructions: str) -> str:
-     if len(instructions) > 0:
-         instructions += "\n"
-     return instructions
-
-
- def get_multiple_choice_joint_adapter_spec(
-     instructions: str,
-     input_noun: Optional[str],
-     output_noun: str,
-     num_outputs: int = 5,
-     max_train_instances: int = 5,
-     max_tokens: int = 5,
-     sample_train: bool = True,
-     **kwargs,
- ) -> AdapterSpec:
-     """
-     [instructions]
-
-     [input_noun]: [input]
-     [reference_1]
-     ...
-     [reference_k]
-     [output_noun]: [output]
-
-     [input_noun]: [input]
-     [reference_1]
-     ...
-     [reference_k]
-     [output_noun]:
-     """
-
-     return AdapterSpec(
-         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-         instructions=format_instructions(instructions),
-         input_prefix=f"{input_noun}: " if input_noun is not None else "",
-         input_suffix="\n" if input_noun is not None else "",
-         output_prefix=f"{output_noun}: ",
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         temperature=0.0,
-         stop_sequences=["\n"],
-         sample_train=sample_train,
-         **kwargs,
-     )
-
-
- def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
-     """
-     [input] [reference_i]
-     or
-     [reference_i]
-     """
-     assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
-
-     return AdapterSpec(
-         method=method,
-         instructions="",
-         input_prefix="",
-         input_suffix="",
-         output_prefix=" " if not empty_input else "",
-         output_suffix="",
-         # Separate is basically language modeling, so can't easily use in-context examples
-         max_train_instances=0,
-         num_outputs=1,
-         max_tokens=0,
-         temperature=0.0,
-     )
-
-
- def get_multiple_choice_adapter_spec(
-     method: str,
-     instructions: str,
-     input_noun: Optional[str],
-     output_noun: str,
-     max_train_instances: int = 5,
-     num_outputs: int = 5,
-     max_tokens: int = 1,
-     empty_input: bool = False,
-     sample_train: bool = True,
-     **kwargs,
- ):
-     """
-     Toggle between joint and separate adapters.
-     """
-     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
-         return get_multiple_choice_joint_adapter_spec(
-             instructions,
-             input_noun,
-             output_noun,
-             max_train_instances=max_train_instances,
-             num_outputs=num_outputs,
-             max_tokens=max_tokens,
-             sample_train=sample_train,
-             **kwargs,
-         )
-     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
-         return get_multiple_choice_separate_adapter_spec(method, empty_input)
-     else:
-         raise ValueError(f"Invalid adaptation method: {method}")
-
-
- def get_ranking_binary_adapter_spec(
-     instructions: str = "",
-     document_noun: str = "Passage",
-     query_noun: str = "Query",
-     output_prefix: str = "Does the passage answer the query?",
-     output_noun: str = "Answer",
-     max_train_instances: int = 4,
-     num_outputs: int = 1,
-     num_train_trials: int = 1,
-     temperature: float = 0.0,
-     max_tokens: int = 5,
-     **kwargs,
- ) -> AdapterSpec:
-     """
-     [instructions]
-
-     [object_noun]: [object]
-     [query_noun]: [query]
-     [prompt_noun]: [prompt_content]
-     [output_noun]: [output]
-
-     ...
-
-     [object_noun]: [object]
-     [query_noun]: [query]
-     [prompt_noun]: [prompt_content]
-     [output_noun]: [output]
-
-     [object_noun]: [object]
-     [query_noun]: [query]
-     [prompt_noun]: [prompt_content]
-     [output_noun]: [output]
-     """
-     msg = (
-         "There must be an even number of in-context examples to ensure that"
-         "an equal number of positive and negative examples are included."
-     )
-     assert max_train_instances % 2 == 0, msg
-     max_train_instances = int(max_train_instances / 2)
-
-     return AdapterSpec(
-         method=ADAPT_RANKING_BINARY,
-         instructions=format_instructions(instructions),
-         input_prefix=f"{query_noun}: ",
-         input_suffix="\n",
-         reference_prefix=f"{document_noun}: ",
-         reference_suffix="\n",
-         output_prefix=f"{output_prefix}\n{output_noun}: ",
-         max_train_instances=max_train_instances,
-         num_outputs=num_outputs,
-         num_train_trials=num_train_trials,
-         temperature=temperature,
-         max_tokens=max_tokens,
-         **kwargs,
-     )
-
-
- def get_completion_adapter_spec(
-     instructions: str = "",
-     input_prefix: str = "",
-     output_prefix: str = "",
-     output_suffix: str = "",
-     max_train_instances: int = 0,
-     temperature: float = 0.0,
-     num_outputs: int = 1,
-     max_tokens: int = 100,
-     stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is no stop sequence,
-     **kwargs,
- ) -> AdapterSpec:
-     """
-     [input][output_prefix][output][output_suffix]
-
-     [input][output_prefix]
-     """
-     if stop_sequences is None:
-         stop_sequences = []
-
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions=format_instructions(instructions),
-         input_prefix=input_prefix,
-         input_suffix="",
-         output_prefix=output_prefix,
-         output_suffix=output_suffix,
-         max_train_instances=max_train_instances,
-         temperature=temperature,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         stop_sequences=stop_sequences,
-         **kwargs,
-     )
-
-
- def get_generation_adapter_spec(
-     instructions: str = "",
-     input_noun: Optional[str] = None,
-     newline_after_input_noun: bool = False,
-     output_noun: Optional[str] = None,
-     newline_after_output_noun: bool = False,
-     max_train_instances: int = 5,
-     num_outputs: int = 1,
-     max_tokens: int = 5,
-     stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
-     temperature: float = 0.0,
-     multi_label: bool = False,
- ) -> AdapterSpec:
-     """
-     [instructions]
-
-     [input_noun]: [input]
-     [output_noun]: [output]
-
-     [input_noun]: [input]
-     [output_noun]:
-     """
-
-     def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
-         """
-         When `append_new_line` is False:
-             [input_noun]: [input]
-
-         When `append_new_line` is True:
-             [input_noun]:
-             [input]
-         """
-         prefix: str = f"{noun}:" if noun is not None else ""
-         if len(prefix) > 0:
-             prefix += "\n" if append_new_line else " "
-         return prefix
-
-     if stop_sequences is None:
-         stop_sequences = ["\n"]
-
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions=format_instructions(instructions),
-         input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun),
-         input_suffix="\n",
-         output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun),
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         temperature=temperature,
-         stop_sequences=stop_sequences,
-         multi_label=multi_label,
-     )
-
-
- def get_instruct_adapter_spec(
-     num_outputs: int = 1,
-     max_tokens: int = 512,
-     temperature: float = 0.7,
- ) -> AdapterSpec:
-     """
-     Zero-shot instruction-following.
-     """
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions="",
-         input_prefix="",
-         input_suffix="\n",
-         output_prefix="",
-         output_suffix="",
-         max_train_instances=0,
-         num_outputs=num_outputs,
-         max_tokens=max_tokens,
-         temperature=temperature,
-         stop_sequences=[],
-     )
-
-
- def get_language_modeling_adapter_spec() -> AdapterSpec:
-     """
-     Used for language modeling.
-     """
-     return AdapterSpec(
-         method=ADAPT_LANGUAGE_MODELING,
-         instructions="",
-         input_prefix="",
-         input_suffix="",
-         output_prefix="",
-         output_suffix="",
-         max_train_instances=0,
-         num_outputs=1,
-         max_tokens=0,
-         temperature=0.0,
-     )
-
-
- def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
-     """
-     Used for summarization.
-     """
-
-     if num_sents == 1:
-         out_pref = "Summarize the above article in 1 sentence.\n"
-     elif num_sents is None:
-         out_pref = "Summarize the above article.\n"
-     else:
-         out_pref = f"Summarize the above article in {num_sents} sentences.\n"
-
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions="",
-         input_prefix="###\nArticle: ",
-         input_suffix="\n\n",
-         output_prefix=out_pref,
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=1,
-         stop_sequences=["###"],  # Separator between few-shot instances.
-         **kwargs,
-     )
-
-
- def get_machine_translation_adapter_spec(
-     source_language, target_language, max_train_instances, **kwargs
- ) -> AdapterSpec:
-     """
-     Used for machine translation.
-     """
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions=f"Translate {source_language} to {target_language}:",
-         input_prefix="",
-         input_suffix=" = ",
-         output_prefix="",
-         output_suffix="\n",
-         max_train_instances=max_train_instances,
-         num_outputs=1,
-         stop_sequences=["\n\n"],
-         temperature=0.0,
-         **kwargs,
-     )
-
-
- ############################################################
- # Examples of scenario and adapter specs
-
-
- def get_scenario_spec1() -> ScenarioSpec:
-     return ScenarioSpec(
-         class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
-         args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 10, "num_test_instances": 10},
-     )
-
-
- def get_scenario_spec_tiny():
-     return ScenarioSpec(
-         class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
-         args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2},
-     )
-
-
- def get_adapter_spec1() -> AdapterSpec:
-     return AdapterSpec(
-         method=ADAPT_GENERATION,
-         instructions="Please solve the following problem.\n",
-         max_train_instances=5,
-         max_eval_instances=10,
-         num_outputs=3,
-         num_train_trials=3,
-         model="simple/model1",
-         temperature=1,
-         stop_sequences=["."],
-     )
-
-
- ############################################################
- # Metrics
-
-
- def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]:
-     return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})]
-
-
- def get_exact_match_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(
-         ["exact_match", "quasi_exact_match", "prefix_exact_match", "quasi_prefix_exact_match"]
-     )
-
-
- def get_f1_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])
-
-
- def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
-             args={"delimiter": delimiter},
-         )
-     ]
-
-
- def get_multiple_choice_classification_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.classification_metrics.MultipleChoiceClassificationMetric", args={}
-         )
-     ]
-
-
- def get_bbq_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
-     ] + get_exact_match_metric_specs()
-
-
- def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]:
-     # Names of the measures we want to compute.
-     measure_names = MSMARCOScenario.MEASURE_NAMES[track]
-     multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
-
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
-             args={
-                 "method": ADAPT_RANKING_BINARY,
-                 "measure_names": measure_names,
-                 "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
-                 "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
-                 "rank": rank,
-                 "multiple_relevance_values": multiple_relevance_values,
-             },
-         ),
-     ] + get_basic_metric_specs(names=[])
-
-
- def get_toxicity_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}),
-     ]
-
-
- def get_bias_metric_specs() -> List[MetricSpec]:
-     demographic_categories = ["race", "gender"]
-     target_categories = ["adjective", "profession"]
-     cross_dem_target = itertools.product(demographic_categories, target_categories)
-
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
-             args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
-         )
-         for dem, tgt in cross_dem_target
-     ] + [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
-             args={"mode": "representation", "demographic_category": dem},
-         )
-         for dem in demographic_categories
-     ]
-
-
- def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
-     return (
-         get_bias_metric_specs()
-         + get_toxicity_metric_specs()
-         + (get_basic_metric_specs([]) if include_basic_metrics else [])
-     )
-
-
- def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args)
-     ] + get_basic_metric_specs([])
-
-
- def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric",
-             args={"num_respondents": num_respondents},
-         )
-     ]
-
-
- def get_srn_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])
-
-
- def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]:
-     metric_specs: List[MetricSpec] = get_basic_metric_specs(
-         ["exact_match", "quasi_exact_match", "absolute_value_difference"]
-     )
-
-     # The solvers are slow to run so make them skippable
-     if run_solver:
-         metric_specs += [
-             MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}),
-         ]
-     return metric_specs
-
-
- def get_math_metric_specs(use_chain_of_thought: bool = True) -> List[MetricSpec]:
-     return get_basic_metric_specs(["math_equiv_chain_of_thought" if use_chain_of_thought else "math_equiv"])
-
-
- def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
-     if args is None:
-         args = {}
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-             args={**args, "name": "longest_common_prefix_length"},
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-             args={**args, "name": "edit_distance"},
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-             args={**args, "name": "edit_similarity"},
-         ),
-     ] + get_basic_metric_specs([])
-
-
- def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
-     if args is None:
-         args = {}
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}
-         ),
-         MetricSpec(
-             class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric",
-             args={"name": "monte_carlo_entropy"},
-         ),
-     ] + get_basic_metric_specs([])
-
-
- def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
-     if dataset == "humaneval":
-         return get_basic_metric_specs(["code_eval_acc", "pass"])
-     else:  # APPS.
-         args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
-         return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
-
-
- def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
-
-
- def get_machine_translation_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric", args={})
-     ] + get_basic_metric_specs([])
-
-
- def get_cleva_machine_translation_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.machine_translation_metrics.CLEVAMachineTranslationMetric", args={}
-         )
-     ] + get_basic_metric_specs([])
-
-
- def get_cleva_paraphrase_generation_metric_specs(alpha: float = 0.8) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.paraphrase_generation_metrics.CLEVAParaphraseGenerationMetric",
-             args={"alpha": alpha},  # calculate iBLEU_0.8 by default
-         )
-     ] + get_basic_metric_specs([])
-
-
- def get_verifiability_judgment_metric_specs() -> List[MetricSpec]:
-     return get_basic_metric_specs(["exact_match", "quasi_exact_match"])
-
-
- def get_instruction_following_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.instruction_following_critique_metrics.InstructionFollowingCritiqueMetric",  # noqa E501
-             args={"num_respondents": num_respondents},
-         )
-     ]
-
-
- def get_cleva_topk_accuracy_metric_specs(k: int = 1, cut_off: int = 5) -> List[MetricSpec]:
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.cleva_accuracy_metrics.CLEVATopKAccuracyMetric",
-             args={"k": k, "cut_off": cut_off},
-         )
-     ]
-
-
- def get_cleva_bias_metric_specs() -> List[MetricSpec]:
-     demographic_categories = ["race", "gender"]
-     target_categories = ["adjective", "profession"]
-     cross_dem_target = itertools.product(demographic_categories, target_categories)
-
-     return [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric",
-             args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
-         )
-         for dem, tgt in cross_dem_target
-     ] + [
-         MetricSpec(
-             class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric",
-             args={"mode": "representation", "demographic_category": dem},
-         )
-         for dem in demographic_categories
-     ]
-
-
- def get_cleva_toxicity_metric_specs() -> List[MetricSpec]:
-     return [
-         MetricSpec(class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVAToxicityMetric", args={}),
687
- ]
688
-
689
-
690
- def get_cleva_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
691
- return (
692
- get_cleva_bias_metric_specs()
693
- + get_cleva_toxicity_metric_specs()
694
- + (get_basic_metric_specs([]) if include_basic_metrics else [])
695
- )
696
-
697
-
698
- def get_cleva_copyright_metric_spec(args: Optional[Dict] = None) -> List[MetricSpec]:
699
- if args is None:
700
- args = {}
701
- return [
702
- MetricSpec(
703
- class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
704
- args={**args, "name": "longest_common_prefix_length"},
705
- ),
706
- MetricSpec(
707
- class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
708
- args={**args, "name": "edit_distance"},
709
- ),
710
- MetricSpec(
711
- class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
712
- args={**args, "name": "edit_similarity"},
713
- ),
714
- ]
715
-
716
-
717
- def get_cleva_generative_task_metric_spec(task: str, subtask: Optional[str], **kwargs) -> List[MetricSpec]:
718
- CLEVA_GEN_TASK_TO_METRIC: Dict[str, Callable] = {
719
- "opinion_mining:opinion_target_extraction": get_exact_match_metric_specs,
720
- "paraphrase_generation": get_cleva_paraphrase_generation_metric_specs,
721
- "closed_book_question_answering:generative_question_answering": get_exact_match_metric_specs,
722
- "conceptual_generalization": get_cleva_topk_accuracy_metric_specs,
723
- "translation:en2zh": get_cleva_machine_translation_metric_specs,
724
- "translation:zh2en": get_cleva_machine_translation_metric_specs,
725
- "mathematical_calculation:add": get_exact_match_metric_specs,
726
- "mathematical_calculation:sub": get_exact_match_metric_specs,
727
- "mathematical_calculation:mul": get_exact_match_metric_specs,
728
- "inductive_reasoning:add": get_exact_match_metric_specs,
729
- "inductive_reasoning:sub": get_exact_match_metric_specs,
730
- "inductive_reasoning:mul": get_exact_match_metric_specs,
731
- "reasoning_primitive:dyck_language": get_exact_match_metric_specs,
732
- "reasoning_primitive:pattern_induction": get_exact_match_metric_specs,
733
- "reasoning_primitive:pattern_matching": get_exact_match_metric_specs,
734
- "reasoning_primitive:variable_sub": get_exact_match_metric_specs,
735
- "subject_knowledge:art": get_exact_match_metric_specs,
736
- "subject_knowledge:biomedicine": get_exact_match_metric_specs,
737
- "subject_knowledge:chemistry": get_exact_match_metric_specs,
738
- "subject_knowledge:computer_science": get_exact_match_metric_specs,
739
- "subject_knowledge:economics": get_exact_match_metric_specs,
740
- "subject_knowledge:geography": get_exact_match_metric_specs,
741
- "subject_knowledge:history": get_exact_match_metric_specs,
742
- "subject_knowledge:law": get_exact_match_metric_specs,
743
- "subject_knowledge:literature": get_exact_match_metric_specs,
744
- "subject_knowledge:math": get_exact_match_metric_specs,
745
- "subject_knowledge:other_general": get_exact_match_metric_specs,
746
- "subject_knowledge:philosophy": get_exact_match_metric_specs,
747
- "subject_knowledge:physics": get_exact_match_metric_specs,
748
- "subject_knowledge:politics": get_exact_match_metric_specs,
749
- "summarization:dialogue_summarization": partial(get_basic_metric_specs, ["chinese_rouge_2"]),
750
- "pinyin_transliteration:pinyin2zh": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
751
- "pinyin_transliteration:zh2pinyin": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
752
- "dialogue_generation:task_oriented": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
753
- "data_to_text_generation": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
754
- "mathematical_reasoning:math_word_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]),
755
- }
756
-
757
- key: str = task
758
- if subtask is not None:
759
- key += ":" + subtask
760
- return CLEVA_GEN_TASK_TO_METRIC[key](**kwargs)
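The table above dispatches on a "task" or "task:subtask" key and uses functools.partial to pre-bind metric names for the basic-metric entries. A standalone sketch of the same lookup, with a placeholder builder standing in for get_basic_metric_specs:

    from functools import partial
    from typing import Callable, Dict, List, Optional

    def basic_specs(names: List[str]) -> List[str]:
        # Placeholder for get_basic_metric_specs; returns labels instead of MetricSpec objects.
        return [f"basic:{name}" for name in names]

    DISPATCH: Dict[str, Callable[..., List[str]]] = {
        "pinyin_transliteration:pinyin2zh": partial(basic_specs, ["chinese_bleu_1"]),
        "summarization:dialogue_summarization": partial(basic_specs, ["chinese_rouge_2"]),
    }

    def specs_for(task: str, subtask: Optional[str] = None, **kwargs) -> List[str]:
        key = task if subtask is None else f"{task}:{subtask}"
        return DISPATCH[key](**kwargs)

    assert specs_for("pinyin_transliteration", "pinyin2zh") == ["basic:chinese_bleu_1"]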
761
-
762
-
763
- ############################################################
764
- # Run specs
765
-
766
-
767
- CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {}
768
- """Dict of run spec function names to run spec functions."""
769
-
770
-
771
- F = TypeVar("F", bound=Callable[..., RunSpec])
772
-
773
-
774
- def run_spec_function(name: str) -> Callable[[F], F]:
775
- """Register the run spec function under the given name."""
776
-
777
- def wrap(func: F) -> F:
778
- if name in CANONICAL_RUN_SPEC_FUNCS:
779
- raise ValueError(f"A run spec function with name {name} already exists")
780
- CANONICAL_RUN_SPEC_FUNCS[name] = func
781
- return func
782
-
783
- return wrap
784
-
785
-
786
- @run_spec_function("simple1")
787
- def get_simple1_spec() -> RunSpec:
788
- """A run spec for debugging."""
789
- return RunSpec(
790
- name="simple1",
791
- scenario_spec=get_scenario_spec1(),
792
- adapter_spec=get_adapter_spec1(),
793
- metric_specs=get_basic_metric_specs([]),
794
- groups=[],
795
- )
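The run_spec_function registry and the simple1 example above disappear from this module because, per the imports added just below, the registry is imported from helm.benchmark.run_spec in 0.5.0. The pattern itself is a plain decorator registry; a minimal standalone re-implementation, with a dict standing in for RunSpec:

    from typing import Callable, Dict, TypeVar

    RUN_SPEC_FUNCS: Dict[str, Callable[..., dict]] = {}
    F = TypeVar("F", bound=Callable[..., dict])

    def run_spec_function(name: str) -> Callable[[F], F]:
        """Register a run-spec builder under the given name, refusing duplicates."""
        def wrap(func: F) -> F:
            if name in RUN_SPEC_FUNCS:
                raise ValueError(f"A run spec function with name {name} already exists")
            RUN_SPEC_FUNCS[name] = func
            return func
        return wrap

    @run_spec_function("simple1")
    def get_simple1_spec() -> dict:
        return {"name": "simple1", "groups": []}

    assert RUN_SPEC_FUNCS["simple1"]() == {"name": "simple1", "groups": []}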
44
+ from helm.benchmark.metrics.metric import MetricSpec
45
+ from helm.benchmark.run_spec import RunSpec, run_spec_function
46
+ from helm.benchmark.runner import get_benchmark_output_path
47
+ from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
48
+ from helm.common.hierarchical_logger import hlog, htrack
796
49
 
797
50
 
798
51
  @run_spec_function("bbq")
@@ -806,7 +59,9 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run
806
59
  input_noun="Passage",
807
60
  output_noun="Answer",
808
61
  )
809
- metric_specs = get_bbq_metric_specs()
62
+ metric_specs = [
63
+ MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
64
+ ] + get_exact_match_metric_specs()
810
65
 
811
66
  return RunSpec(
812
67
  name=f"bbq:subject={subject},method={method}",
@@ -819,6 +74,8 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run
819
74
 
820
75
  @run_spec_function("msmarco")
821
76
  def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:
77
+ from helm.benchmark.scenarios.msmarco_scenario import MSMARCOScenario
78
+
822
79
  valid_topk = None if valid_topk is None else int(valid_topk)
823
80
  scenario_spec = ScenarioSpec(
824
81
  class_name="helm.benchmark.scenarios.msmarco_scenario.MSMARCOScenario",
@@ -827,11 +84,33 @@ def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:
827
84
 
828
85
  adapter_spec: AdapterSpec = get_ranking_binary_adapter_spec(max_train_instances=4, stop_sequences=["\n"])
829
86
 
87
+ # Names of the measures we want to compute.
88
+ measure_names = MSMARCOScenario.MEASURE_NAMES[track]
89
+ multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
90
+
91
+ metric_specs = (
92
+ [
93
+ MetricSpec(
94
+ class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
95
+ args={
96
+ "method": ADAPT_RANKING_BINARY,
97
+ "measure_names": measure_names,
98
+ "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
99
+ "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
100
+ "rank": valid_topk,
101
+ "multiple_relevance_values": multiple_relevance_values,
102
+ },
103
+ ),
104
+ ]
105
+ + get_basic_reference_metric_specs()
106
+ + get_generic_metric_specs()
107
+ )
108
+
830
109
  return RunSpec(
831
110
  name=f"msmarco:track={track},valid_topk={valid_topk}",
832
111
  scenario_spec=scenario_spec,
833
112
  adapter_spec=adapter_spec,
834
- metric_specs=get_msmarco_metric_specs(track=track, rank=valid_topk),
113
+ metric_specs=metric_specs,
835
114
  groups=[f"msmarco_{track}"],
836
115
  )
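A pattern that recurs throughout the 0.5.0 side of this file: scenario helpers such as MSMARCOScenario are imported inside the run-spec function rather than at module level, presumably so that importing the run-spec module stays cheap and optional scenario dependencies are only loaded when that spec is actually built. A tiny self-contained illustration of the deferred-import behavior, using a stdlib module as the stand-in:

    import sys

    def build_sketch_spec() -> dict:
        # Deferred import: the stand-in "heavy" module loads only when the spec is built.
        import csv  # placeholder for a scenario module with extra dependencies
        return {"name": "msmarco", "scenario_module": csv.__name__}

    print("stand-in module loaded before building spec:", "csv" in sys.modules)
    build_sketch_spec()
    print("stand-in module loaded after building spec:", "csv" in sys.modules)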
837
116
 
@@ -905,28 +184,6 @@ def get_custom_mcqa_spec(
905
184
  )
906
185
 
907
186
 
908
- @run_spec_function("mmlu")
909
- def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
910
- scenario_spec = ScenarioSpec(
911
- class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": subject}
912
- )
913
-
914
- adapter_spec = get_multiple_choice_adapter_spec(
915
- method=method,
916
- instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.",
917
- input_noun="Question",
918
- output_noun="Answer",
919
- )
920
-
921
- return RunSpec(
922
- name=f"mmlu:subject={subject},method={method}",
923
- scenario_spec=scenario_spec,
924
- adapter_spec=adapter_spec,
925
- metric_specs=get_exact_match_metric_specs(),
926
- groups=["mmlu"],
927
- )
928
-
929
-
930
187
  @run_spec_function("interactive_qa_mmlu")
931
188
  def get_interactive_qa_mmlu_spec(subject: str) -> RunSpec:
932
189
  scenario_spec = ScenarioSpec(
@@ -975,29 +232,6 @@ def get_wikifact_spec(k: str, subject: str) -> RunSpec:
975
232
  )
976
233
 
977
234
 
978
- @run_spec_function("commonsense")
979
- def get_commonsense_spec(dataset: str, method: str) -> RunSpec:
980
- scenario_spec = ScenarioSpec(
981
- class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseScenario",
982
- args={"dataset": dataset},
983
- )
984
-
985
- adapter_spec = get_multiple_choice_adapter_spec(
986
- method=method,
987
- instructions="The following are multiple choice questions (with answers) about common sense.",
988
- input_noun="Question",
989
- output_noun="Answer",
990
- )
991
-
992
- return RunSpec(
993
- name=f"commonsense:dataset={dataset},method={method}",
994
- scenario_spec=scenario_spec,
995
- adapter_spec=adapter_spec,
996
- metric_specs=get_exact_match_metric_specs(),
997
- groups=[dataset],
998
- )
999
-
1000
-
1001
235
  @run_spec_function("quac")
1002
236
  def get_quac_spec() -> RunSpec:
1003
237
  scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.quac_scenario.QuACScenario", args={})
@@ -1060,7 +294,7 @@ def get_twitter_aae_spec(demographic: str) -> RunSpec:
1060
294
  name=f"twitter_aae:demographic={demographic}",
1061
295
  scenario_spec=scenario_spec,
1062
296
  adapter_spec=get_language_modeling_adapter_spec(),
1063
- metric_specs=get_basic_metric_specs([]),
297
+ metric_specs=get_language_modeling_metric_specs([]),
1064
298
  groups=["twitter_aae", f"twitter_aae_{demographic}"],
1065
299
  )
1066
300
 
@@ -1088,7 +322,9 @@ def get_real_toxicity_prompts_spec() -> RunSpec:
1088
322
  name="real_toxicity_prompts",
1089
323
  scenario_spec=scenario_spec,
1090
324
  adapter_spec=adapter_spec,
1091
- metric_specs=get_generative_harms_metric_specs(include_basic_metrics=True),
325
+ metric_specs=get_generative_harms_metric_specs(
326
+ include_basic_metrics=True, include_generative_harms_metrics=True
327
+ ),
1092
328
  groups=["real_toxicity_prompts"],
1093
329
  )
1094
330
 
@@ -1108,46 +344,28 @@ def get_synthetic_reasoning_natural_spec(difficulty: str) -> RunSpec:
1108
344
  max_train_instances=3, # limited by the context length
1109
345
  max_tokens=20,
1110
346
  )
347
+ srn_metric_specs = get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])
1111
348
 
1112
349
  return RunSpec(
1113
350
  name=f"synthetic_reasoning_natural:difficulty={difficulty}",
1114
351
  scenario_spec=scenario_spec,
1115
352
  adapter_spec=adapter_spec,
1116
- metric_specs=get_srn_metric_specs() + get_generative_harms_metric_specs(),
353
+ metric_specs=srn_metric_specs + get_generative_harms_metric_specs(),
1117
354
  groups=["synthetic_reasoning", "synthetic_reasoning_natural"],
1118
355
  )
1119
356
 
1120
357
 
1121
- @run_spec_function("gsm")
1122
- def get_gsm_spec() -> RunSpec:
1123
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={})
1124
-
1125
- # Create AdapterSpec based on the GSM8K paper: https://arxiv.org/pdf/2110.14168.pdf
1126
- adapter_spec = get_generation_adapter_spec(
1127
- input_noun="Q",
1128
- output_noun="A",
1129
- max_train_instances=5, # Due to limited context and long example length
1130
- max_tokens=400, # The paper uses 400 tokens as the max sample length
1131
- stop_sequences=["\n\n"], # Since answer may contain newlines, we use two as SEP
1132
- )
1133
-
1134
- return RunSpec(
1135
- name="gsm",
1136
- scenario_spec=scenario_spec,
1137
- adapter_spec=adapter_spec,
1138
- metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(),
1139
- groups=["gsm"],
1140
- )
1141
-
1142
-
1143
358
  @run_spec_function("raft")
1144
359
  def get_raft_spec(subset: str) -> RunSpec:
360
+ from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions
361
+
1145
362
  scenario_spec = ScenarioSpec(
1146
363
  class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset}
1147
364
  )
1148
365
 
366
+ scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name)
1149
367
  adapter_spec = get_generation_adapter_spec(
1150
- instructions=get_raft_instructions(subset),
368
+ instructions=get_raft_instructions(subset, scenario_cache_path),
1151
369
  input_noun=None,
1152
370
  output_noun="Label",
1153
371
  max_tokens=30, # at most ~50 characters per label
@@ -1166,9 +384,10 @@ def get_raft_spec(subset: str) -> RunSpec:
1166
384
  def get_numeracy_spec(
1167
385
  relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
1168
386
  ) -> RunSpec:
1169
- from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
387
+ from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
1170
388
 
1171
- run_solver: bool = True if run_solver == "True" else False # type: ignore
389
+ run_solver_bool: bool = True if run_solver == "True" else False
390
+ del run_solver
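Several run-spec functions take string-typed flags (run_solver: str = "False" here) and parse them by comparing against "True"; the 0.5.0 change stores the result in a new variable and deletes the original name instead of re-typing the parameter, which is what forced the old # type: ignore. A small sketch of a slightly stricter version of the same parse, not the HELM code itself:

    def parse_bool_flag(value: str) -> bool:
        # Accept only the two literal spellings used by these string-typed run-spec arguments.
        if value not in ("True", "False"):
            raise ValueError(f"Expected 'True' or 'False', got {value!r}")
        return value == "True"

    assert parse_bool_flag("True") is True
    assert parse_bool_flag("False") is False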
1172
391
  random_seed = int(seed)
1173
392
  scenario_spec = ScenarioSpec(
1174
393
  class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
@@ -1208,72 +427,11 @@ def get_numeracy_spec(
1208
427
  name=f"numeracy:relation_type={relation_type},mode={mode}",
1209
428
  scenario_spec=scenario_spec,
1210
429
  adapter_spec=adapter_spec,
1211
- metric_specs=get_numeracy_metric_specs(run_solver), # type: ignore
430
+ metric_specs=get_numeracy_metric_specs(run_solver_bool),
1212
431
  groups=["numeracy"],
1213
432
  )
1214
433
 
1215
434
 
1216
- @run_spec_function("math")
1217
- def get_math_spec(
1218
- subject: str,
1219
- level: str,
1220
- use_official_examples: str = "False",
1221
- use_chain_of_thought: str = "False",
1222
- ) -> RunSpec:
1223
- use_official_examples: bool = use_official_examples == "True" # type: ignore
1224
- use_chain_of_thought: bool = use_chain_of_thought == "True" # type: ignore
1225
- if use_chain_of_thought:
1226
- assert not use_official_examples, "Cannot use official examples when use_chain_of_thought is True."
1227
- scenario_spec = ScenarioSpec(
1228
- class_name="helm.benchmark.scenarios.math_scenario.MATHScenario",
1229
- args={
1230
- "subject": subject,
1231
- "level": level,
1232
- "use_official_examples": use_official_examples,
1233
- "use_chain_of_thought": use_chain_of_thought,
1234
- },
1235
- )
1236
-
1237
- if use_chain_of_thought: # Include the solution in the output as per https://arxiv.org/abs/2201.11903
1238
- output_prefix = "Answer: " # Don't include LaTeX '$' delimiters
1239
- output_suffix = "\n"
1240
- instance_prefix = "###\n" # Don't include LaTeX '$' delimiters
1241
- max_tokens = 400 # Increase the number of tokens to generate
1242
- stop_sequences = ["###"] # Break at the next instance; extraneous output will be stripped out
1243
- groups = ["math_chain_of_thought"]
1244
- else:
1245
- output_prefix = "Answer: $"
1246
- output_suffix = "$\n"
1247
- instance_prefix = "###\n"
1248
- max_tokens = 20
1249
- stop_sequences = ["$"] # Break at the nearest LaTeX closing delimiter
1250
- groups = ["math_regular"]
1251
-
1252
- adapter_spec = AdapterSpec(
1253
- method=ADAPT_GENERATION,
1254
- instructions="Given a mathematics problem, determine the answer. Simplify your answer as much as possible.\n",
1255
- max_train_instances=8,
1256
- num_outputs=1,
1257
- temperature=0.0,
1258
- stop_sequences=stop_sequences,
1259
- max_tokens=max_tokens,
1260
- input_prefix="Problem: ",
1261
- input_suffix="\n",
1262
- output_prefix=output_prefix,
1263
- output_suffix=output_suffix,
1264
- instance_prefix=instance_prefix,
1265
- )
1266
-
1267
- return RunSpec(
1268
- name=f"math:subject={subject},level={level},"
1269
- f"use_official_examples={use_official_examples},use_chain_of_thought={use_chain_of_thought}",
1270
- scenario_spec=scenario_spec,
1271
- adapter_spec=adapter_spec,
1272
- metric_specs=get_math_metric_specs(use_chain_of_thought) + get_generative_harms_metric_specs(), # type: ignore
1273
- groups=groups,
1274
- )
1275
-
1276
-
1277
435
  @run_spec_function("boolq")
1278
436
  def get_boolq_spec(only_contrast=False) -> RunSpec:
1279
437
  scenario_spec = ScenarioSpec(
@@ -1358,6 +516,8 @@ def get_copyright_spec(
1358
516
  normalize_by_prefix_length=True,
1359
517
  normalize_newline_space_tab=False,
1360
518
  ) -> RunSpec:
519
+ from helm.benchmark.scenarios.copyright_scenario import datatag2hash_code
520
+
1361
521
  scenario_spec = ScenarioSpec(
1362
522
  class_name="helm.benchmark.scenarios.copyright_scenario.CopyrightScenario", args=dict(datatag=datatag)
1363
523
  )
@@ -1470,36 +630,21 @@ def get_code_spec(dataset: str, timeout=3) -> RunSpec:
1470
630
  max_tokens=600,
1471
631
  )
1472
632
 
633
+ if dataset == "humaneval":
634
+ code_metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"])
635
+ else: # APPS.
636
+ args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
637
+ code_metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
638
+
1473
639
  return RunSpec(
1474
640
  name=f"code:dataset={dataset}",
1475
641
  scenario_spec=scenario_spec,
1476
642
  adapter_spec=adapter_spec,
1477
- metric_specs=get_code_metric_specs(dataset, timeout) + get_generative_harms_metric_specs(),
643
+ metric_specs=code_metric_specs + get_generative_harms_metric_specs(),
1478
644
  groups=[f"code_{dataset}"],
1479
645
  )
1480
646
 
1481
647
 
1482
- @run_spec_function("natural_qa")
1483
- def get_natural_qa_spec(mode: str) -> RunSpec:
1484
- scenario_spec = ScenarioSpec(
1485
- class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario", args={"mode": mode}
1486
- )
1487
-
1488
- adapter_spec = get_generation_adapter_spec(
1489
- input_noun="Question" if mode == "closedbook" else None,
1490
- output_noun="Answer",
1491
- max_tokens=300, # answers are at most 65 words
1492
- )
1493
-
1494
- return RunSpec(
1495
- name=f"natural_qa:mode={mode}",
1496
- scenario_spec=scenario_spec,
1497
- adapter_spec=adapter_spec,
1498
- metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(),
1499
- groups=[f"natural_qa_{mode}"],
1500
- )
1501
-
1502
-
1503
648
  @run_spec_function("the_pile")
1504
649
  def get_the_pile_spec(subset: str) -> RunSpec:
1505
650
  scenario_spec = ScenarioSpec(
@@ -1510,7 +655,7 @@ def get_the_pile_spec(subset: str) -> RunSpec:
1510
655
  name=f"the_pile:subset={subset}",
1511
656
  scenario_spec=scenario_spec,
1512
657
  adapter_spec=get_language_modeling_adapter_spec(),
1513
- metric_specs=get_basic_metric_specs([]),
658
+ metric_specs=get_language_modeling_metric_specs([]),
1514
659
  groups=["the_pile"],
1515
660
  )
1516
661
 
@@ -1523,32 +668,11 @@ def get_ice_spec(**kwargs) -> RunSpec:
1523
668
  name="ice" + (":" if len(kwargs) > 0 else "") + ",".join(f"{k}={v}" for k, v in sorted(kwargs.items())),
1524
669
  scenario_spec=scenario_spec,
1525
670
  adapter_spec=get_language_modeling_adapter_spec(),
1526
- metric_specs=get_basic_metric_specs([]),
671
+ metric_specs=get_language_modeling_metric_specs([]),
1527
672
  groups=["ice"],
1528
673
  )
1529
674
 
1530
675
 
1531
- @run_spec_function("narrative_qa")
1532
- def get_narrativeqa_spec() -> RunSpec:
1533
- scenario_spec = ScenarioSpec(
1534
- class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario", args={}
1535
- )
1536
-
1537
- adapter_spec = get_generation_adapter_spec(
1538
- input_noun="Passage",
1539
- output_noun="Answer",
1540
- max_tokens=100, # max 30 words
1541
- )
1542
-
1543
- return RunSpec(
1544
- name="narrative_qa",
1545
- scenario_spec=scenario_spec,
1546
- adapter_spec=adapter_spec,
1547
- metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
1548
- groups=["narrative_qa"],
1549
- )
1550
-
1551
-
1552
676
  @run_spec_function("synthetic_efficiency")
1553
677
  def get_synthetic_efficiency_spec(
1554
678
  num_prompt_tokens: Optional[int] = None,
@@ -1570,7 +694,9 @@ def get_synthetic_efficiency_spec(
1570
694
  name=f"synthetic_efficiency:random={random}",
1571
695
  scenario_spec=scenario_spec,
1572
696
  adapter_spec=adapter_spec,
1573
- metric_specs=get_basic_metric_specs(["exact_match"]) + get_generative_harms_metric_specs(),
697
+ metric_specs=get_basic_generation_metric_specs(["exact_match"])
698
+ + get_generic_metric_specs()
699
+ + get_generative_harms_metric_specs(),
1574
700
  groups=["synthetic_efficiency"],
1575
701
  )
1576
702
 
@@ -1609,7 +735,7 @@ def get_wikitext_103_spec() -> RunSpec:
1609
735
  name="wikitext_103",
1610
736
  scenario_spec=scenario_spec,
1611
737
  adapter_spec=get_language_modeling_adapter_spec(),
1612
- metric_specs=get_basic_metric_specs([]),
738
+ metric_specs=get_language_modeling_metric_specs([]),
1613
739
  groups=["wikitext_103"],
1614
740
  )
1615
741
 
@@ -1757,7 +883,9 @@ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec:
1757
883
  name=f"dyck_language_np={int(num_parenthesis_pairs)}",
1758
884
  scenario_spec=scenario_spec,
1759
885
  adapter_spec=adapter_spec,
1760
- metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(),
886
+ metric_specs=get_basic_generation_metric_specs(["exact_match_indicator"])
887
+ + get_generic_metric_specs()
888
+ + get_generative_harms_metric_specs(),
1761
889
  groups=["dyck_language"],
1762
890
  )
1763
891
 
@@ -1827,6 +955,8 @@ def get_entity_data_imputation_spec(dataset: str) -> RunSpec:
1827
955
  @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec")
1828
956
  @run_spec_function("big_bench")
1829
957
  def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
958
+ from helm.benchmark.scenarios.big_bench_scenario import BIGBenchScenario
959
+
1830
960
  def get_adaptation_method(big_bench_metrics: List[str]) -> str:
1831
961
  """
1832
962
  From BIG-bench, "there are three types of BIG-bench JSON tasks - generative and scoring
@@ -1871,16 +1001,14 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
1871
1001
  )
1872
1002
 
1873
1003
  # Get BIG-bench task definition.
1874
- # TODO: get `output_path` here without hardcoding
1875
- output_path: str = "benchmark_output/scenarios/big_bench"
1876
- big_bench_task: Dict = BIGBenchScenario.download_and_get_task(output_path, task, subtask)
1004
+ scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name)
1005
+ big_bench_task: Dict = BIGBenchScenario.download_and_get_task(scenario_cache_path, task, subtask)
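Here the hardcoded "benchmark_output/scenarios/big_bench" path is replaced by get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name). The sketch below is only an assumption about the shape of that path, inferred from the old hardcoded value; it is not the actual HELM helper:

    import os

    def scenario_cache_path_sketch(benchmark_output_path: str, scenario_name: str) -> str:
        # Hypothetical stand-in for get_scenario_cache_path; layout guessed from the
        # removed hardcoded "benchmark_output/scenarios/big_bench" value.
        return os.path.join(benchmark_output_path, "scenarios", scenario_name)

    assert scenario_cache_path_sketch("benchmark_output", "big_bench") == os.path.join(
        "benchmark_output", "scenarios", "big_bench"
    )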
1877
1006
 
1878
1007
  # The JSON schema for BIG-bench can be found here:
1879
1008
  # https://github.com/google/BIG-bench/blob/main/docs/doc.md#json-schema.
1880
1009
  # "metrics" is a required field. The default values were populated using the link above.
1881
1010
  adapter_spec = AdapterSpec(
1882
1011
  method=get_adaptation_method(big_bench_task["metrics"]),
1883
- model="openai/text-curie-001", # Can override with the `ModelRunExpander`.
1884
1012
  max_train_instances=5, # Can override with the `MaxTrainInstancesRunExpander`.
1885
1013
  num_outputs=1, # Can override with the `NumOutputsRunExpander`.
1886
1014
  # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
@@ -1907,9 +1035,8 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
1907
1035
  name=run_spec_name,
1908
1036
  scenario_spec=scenario_spec,
1909
1037
  adapter_spec=adapter_spec,
1910
- # TODO add generative harms when applicable
1911
1038
  metric_specs=get_metric_specs(big_bench_task["metrics"]),
1912
- groups=["BIG-bench"],
1039
+ groups=[f"big_bench_{task}"],
1913
1040
  )
1914
1041
 
1915
1042
 
@@ -1991,7 +1118,7 @@ def get_med_mcqa_spec() -> RunSpec:
1991
1118
  scenario_spec=scenario_spec,
1992
1119
  adapter_spec=adapter_spec,
1993
1120
  metric_specs=get_exact_match_metric_specs(),
1994
- groups=["MedMCQA"],
1121
+ groups=["med_mcqa"],
1995
1122
  )
1996
1123
 
1997
1124
 
@@ -2017,48 +1144,86 @@ def get_med_paragraph_simplification_spec() -> RunSpec:
2017
1144
  )
2018
1145
 
2019
1146
 
2020
- @run_spec_function("med_qa")
2021
- def get_med_qa_spec() -> RunSpec:
2022
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_qa_scenario.MedQAScenario", args={})
1147
+ @run_spec_function("pubmed_qa")
1148
+ def get_pubmed_qa_spec() -> RunSpec:
1149
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
2023
1150
 
2024
1151
  adapter_spec = get_multiple_choice_adapter_spec(
2025
1152
  method=ADAPT_MULTIPLE_CHOICE_JOINT,
2026
- instructions="Give a letter answer among A, B, C or D.",
1153
+ instructions="Answer A for yes, B for no or C for maybe.",
2027
1154
  input_noun="Question",
2028
1155
  output_noun="Answer",
2029
1156
  )
2030
1157
 
2031
1158
  return RunSpec(
2032
- name="med_qa",
1159
+ name="pubmed_qa",
2033
1160
  scenario_spec=scenario_spec,
2034
1161
  adapter_spec=adapter_spec,
2035
1162
  metric_specs=get_exact_match_metric_specs(),
2036
- groups=["MedQA"],
1163
+ groups=["pubmed_qa"],
2037
1164
  )
2038
1165
 
2039
1166
 
2040
- @run_spec_function("pubmed_qa")
2041
- def get_pubmed_qa_spec() -> RunSpec:
2042
- scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
1167
+ @run_spec_function("live_qa")
1168
+ def get_live_qa_spec() -> RunSpec:
1169
+ from helm.common.gpu_utils import get_torch_device_name
2043
1170
 
2044
- adapter_spec = get_multiple_choice_adapter_spec(
2045
- method=ADAPT_MULTIPLE_CHOICE_JOINT,
2046
- instructions="Answer A for yes, B for no or C for maybe.",
1171
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
1172
+
1173
+ adapter_spec = get_generation_adapter_spec(
1174
+ instructions="Please answer the following consumer health question.",
2047
1175
  input_noun="Question",
2048
1176
  output_noun="Answer",
1177
+ max_train_instances=0,
1178
+ max_tokens=512,
2049
1179
  )
2050
1180
 
2051
1181
  return RunSpec(
2052
- name="pubmed_qa",
1182
+ name="live_qa",
2053
1183
  scenario_spec=scenario_spec,
2054
1184
  adapter_spec=adapter_spec,
2055
- metric_specs=get_exact_match_metric_specs(),
2056
- groups=["pubmed_qa"],
1185
+ metric_specs=get_summarization_metric_specs(
1186
+ {"task": "live_qa", "device": get_torch_device_name()},
1187
+ ),
1188
+ groups=["live_qa"],
1189
+ )
1190
+
1191
+
1192
+ @run_spec_function("medication_qa")
1193
+ def get_medication_qa_spec() -> RunSpec:
1194
+ from helm.common.gpu_utils import get_torch_device_name
1195
+
1196
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
1197
+
1198
+ adapter_spec = get_generation_adapter_spec(
1199
+ instructions="Please answer the following consumer health question.",
1200
+ input_noun="Question",
1201
+ output_noun="Answer",
1202
+ max_train_instances=0,
1203
+ max_tokens=512,
1204
+ )
1205
+
1206
+ return RunSpec(
1207
+ name="medication_qa",
1208
+ scenario_spec=scenario_spec,
1209
+ adapter_spec=adapter_spec,
1210
+ metric_specs=get_summarization_metric_specs(
1211
+ {"task": "medication_qa", "device": get_torch_device_name()},
1212
+ ),
1213
+ groups=["medication_qa"],
2057
1214
  )
2058
1215
 
2059
1216
 
2060
1217
  @run_spec_function("lextreme")
2061
1218
  def get_lextreme_spec(subset: str) -> RunSpec:
1219
+ from helm.benchmark.scenarios.lextreme_scenario import (
1220
+ get_lextreme_instructions,
1221
+ get_lextreme_max_train_instances,
1222
+ get_lextreme_max_tokens,
1223
+ TaskType,
1224
+ get_lextreme_task_type,
1225
+ )
1226
+
2062
1227
  task_type = get_lextreme_task_type(subset)
2063
1228
 
2064
1229
  scenario_spec = ScenarioSpec(
@@ -2075,7 +1240,7 @@ def get_lextreme_spec(subset: str) -> RunSpec:
2075
1240
  multi_label=(task_type == TaskType.MLTC),
2076
1241
  )
2077
1242
 
2078
- metric_specs = get_basic_metric_specs([])
1243
+ metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
2079
1244
  if task_type == TaskType.MLTC:
2080
1245
  metric_specs += get_classification_metric_specs(delimiter=", ")
2081
1246
  elif task_type == TaskType.SLTC:
@@ -2092,6 +1257,14 @@ def get_lextreme_spec(subset: str) -> RunSpec:
2092
1257
 
2093
1258
  @run_spec_function("lex_glue")
2094
1259
  def get_lex_glue_spec(subset: str) -> RunSpec:
1260
+ from helm.benchmark.scenarios.lex_glue_scenario import (
1261
+ get_lex_glue_instructions,
1262
+ get_lex_glue_max_tokens,
1263
+ get_lex_glue_max_train_instances,
1264
+ get_lex_glue_task_type,
1265
+ )
1266
+ from helm.benchmark.scenarios.lextreme_scenario import TaskType
1267
+
2095
1268
  task_type = get_lex_glue_task_type(subset)
2096
1269
 
2097
1270
  scenario_spec = ScenarioSpec(
@@ -2108,7 +1281,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec:
2108
1281
  multi_label=(task_type == TaskType.MLTC),
2109
1282
  )
2110
1283
 
2111
- metric_specs = get_basic_metric_specs([])
1284
+ metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
2112
1285
  if task_type == TaskType.MLTC:
2113
1286
  metric_specs += get_classification_metric_specs(delimiter=", ")
2114
1287
  elif task_type == TaskType.SLTC:
@@ -2207,92 +1380,6 @@ def get_eurlexsum_legal_summarization_spec(temperature: float = 0.3, device: str
2207
1380
  )
2208
1381
 
2209
1382
 
2210
- @run_spec_function("wmt_14")
2211
- def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
2212
- FULL_LANGUAGE_NAMES = {
2213
- "cs": "Czech",
2214
- "de": "German",
2215
- "fr": "French",
2216
- "hi": "Hindi",
2217
- "ru": "Russian",
2218
- "en": "English",
2219
- }
2220
- source_language, target_language = language_pair.split("-")
2221
-
2222
- scenario_spec = ScenarioSpec(
2223
- class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario",
2224
- args={"source_language": source_language, "target_language": target_language},
2225
- )
2226
-
2227
- adapter_spec = get_machine_translation_adapter_spec(
2228
- source_language=FULL_LANGUAGE_NAMES[source_language],
2229
- target_language=FULL_LANGUAGE_NAMES[target_language],
2230
- max_train_instances=max_train_instances,
2231
- )
2232
-
2233
- return RunSpec(
2234
- name=f"wmt_14:language_pair={language_pair}",
2235
- scenario_spec=scenario_spec,
2236
- adapter_spec=adapter_spec,
2237
- metric_specs=get_machine_translation_metric_specs(),
2238
- groups=["wmt_14"],
2239
- )
2240
-
2241
-
2242
- @run_spec_function("self_instruct")
2243
- def get_self_instruct_spec(num_respondents: int) -> RunSpec:
2244
- scenario_spec = ScenarioSpec(
2245
- class_name="helm.benchmark.scenarios.self_instruct_scenario.SelfInstructScenario",
2246
- args={},
2247
- )
2248
-
2249
- adapter_spec = get_instruct_adapter_spec()
2250
-
2251
- return RunSpec(
2252
- name="self_instruct",
2253
- scenario_spec=scenario_spec,
2254
- adapter_spec=adapter_spec,
2255
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2256
- groups=["self_instruct"],
2257
- )
2258
-
2259
-
2260
- @run_spec_function("vicuna")
2261
- def get_vicuna_spec(num_respondents: int, category: str = "all") -> RunSpec:
2262
- scenario_spec = ScenarioSpec(
2263
- class_name="helm.benchmark.scenarios.vicuna_scenario.VicunaScenario",
2264
- args={"category": category},
2265
- )
2266
-
2267
- adapter_spec = get_instruct_adapter_spec()
2268
-
2269
- return RunSpec(
2270
- name=f"vicuna:category={category}", # TODO: add args
2271
- scenario_spec=scenario_spec,
2272
- adapter_spec=adapter_spec,
2273
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2274
- groups=["vicuna"],
2275
- )
2276
-
2277
-
2278
- @run_spec_function("grammar")
2279
- def get_grammar_spec(num_respondents: int, path: str, tags: str) -> RunSpec:
2280
- scenario_spec = ScenarioSpec(
2281
- class_name="helm.benchmark.scenarios.grammar_scenario.GrammarScenario",
2282
- args={"path": path, "tags": tags},
2283
- )
2284
-
2285
- adapter_spec = get_instruct_adapter_spec()
2286
-
2287
- return RunSpec(
2288
- name=f"grammar:path={path},tags={tags}",
2289
- scenario_spec=scenario_spec,
2290
- adapter_spec=adapter_spec,
2291
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2292
- groups=["grammar"],
2293
- )
2294
-
2295
-
2296
1383
  @run_spec_function("verifiability_judgment")
2297
1384
  def get_verifiability_judgment_spec() -> RunSpec:
2298
1385
  scenario_spec = ScenarioSpec(
@@ -2315,7 +1402,7 @@ def get_verifiability_judgment_spec() -> RunSpec:
2315
1402
  name="verifiability_judgment",
2316
1403
  scenario_spec=scenario_spec,
2317
1404
  adapter_spec=adapter_spec,
2318
- metric_specs=get_verifiability_judgment_metric_specs(),
1405
+ metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match"]),
2319
1406
  groups=["verifiability_judgment"],
2320
1407
  )
2321
1408
 
@@ -2355,269 +1442,69 @@ def get_opinions_qa_spec(
2355
1442
  )
2356
1443
 
2357
1444
 
2358
- @run_spec_function("open_assistant")
2359
- def get_open_assistant_spec(num_respondents: int, language: str) -> RunSpec:
2360
- scenario_spec = ScenarioSpec(
2361
- class_name="helm.benchmark.scenarios.open_assistant_scenario.OpenAssistantScenario",
2362
- args={"language": language},
2363
- )
2364
-
2365
- adapter_spec = get_instruct_adapter_spec()
2366
-
2367
- return RunSpec(
2368
- name=f"open_assistant:language={language}",
2369
- scenario_spec=scenario_spec,
2370
- adapter_spec=adapter_spec,
2371
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2372
- groups=["open_assistant"],
2373
- )
2374
-
2375
-
2376
- @run_spec_function("koala")
2377
- def get_koala_spec(num_respondents: int) -> RunSpec:
1445
+ @run_spec_function("lm_entry")
1446
+ def get_lm_entry_spec(task: str, method: str = ADAPT_GENERATION) -> RunSpec:
2378
1447
  scenario_spec = ScenarioSpec(
2379
- class_name="helm.benchmark.scenarios.koala_scenario.KoalaScenario",
2380
- args={},
1448
+ class_name="helm.benchmark.scenarios.lm_entry_scenario.LMEntryScenario",
1449
+ args={"task": task},
2381
1450
  )
1451
+ adapter_spec: AdapterSpec
1452
+ metric_specs: List[MetricSpec]
2382
1453
 
2383
- adapter_spec = get_instruct_adapter_spec()
1454
+ if method == ADAPT_MULTIPLE_CHOICE_JOINT:
1455
+ if task in ["first_letter", "last_letter", "first_word", "last_word", "word_before", "word_after"]:
1456
+ raise ValueError(f"Task {task} cannot be cast to multiple choice.")
1457
+
1458
+ adapter_spec = get_multiple_choice_adapter_spec(
1459
+ method=method,
1460
+ instructions="Answer the following multiple choice question with a single letter",
1461
+ input_noun="Question",
1462
+ output_noun="\nAnswer",
1463
+ )
1464
+ metric_specs = get_exact_match_metric_specs()
1465
+ elif method == ADAPT_GENERATION:
1466
+ adapter_spec = get_generation_adapter_spec(
1467
+ instructions="Answer the following question in one word.",
1468
+ input_noun="Q",
1469
+ output_noun="\nA",
1470
+ # Shouldn't use any stop sequences because the task is zero-shot and thus we
1471
+ # don't expect the model to magically figure out the output format.
1472
+ stop_sequences=[],
1473
+ # Set max_tokens to save tokens. The answer is a word so 10 tokens should suffice.
1474
+ max_tokens=10,
1475
+ )
1476
+ # It makes no sense to include non-quasi exact match metrics for this task.
1477
+ metric_specs = get_basic_metric_specs(["quasi_exact_match", "quasi_prefix_exact_match", "f1_score"])
1478
+ else:
1479
+ raise ValueError(f"Unknown method: {method}")
2384
1480
 
2385
1481
  return RunSpec(
2386
- name="koala",
1482
+ name=f"lm_entry:task={task},method={method}",
2387
1483
  scenario_spec=scenario_spec,
2388
1484
  adapter_spec=adapter_spec,
2389
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2390
- groups=["koala"],
1485
+ metric_specs=metric_specs,
1486
+ groups=["lm_entry"],
2391
1487
  )
2392
1488
 
2393
1489
 
2394
- @run_spec_function("anthropic_hh_rlhf")
2395
- def get_anthropic_hh_rlhf_spec(num_respondents: int, subset: str) -> RunSpec:
1490
+ @run_spec_function("thai_exam")
1491
+ def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
2396
1492
  scenario_spec = ScenarioSpec(
2397
- class_name="helm.benchmark.scenarios.anthropic_hh_rlhf_scenario.AnthropicHHRLHFScenario",
2398
- args={"subset": subset},
1493
+ class_name="helm.benchmark.scenarios.thai_exam_scenario.ThaiExamScenario", args={"exam": exam}
2399
1494
  )
2400
1495
 
2401
- adapter_spec = get_instruct_adapter_spec()
2402
-
2403
- return RunSpec(
2404
- name=f"anthropic_hh_rlhf:subset={subset}",
2405
- scenario_spec=scenario_spec,
2406
- adapter_spec=adapter_spec,
2407
- metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
2408
- groups=["anthropic_hh_rlhf"],
2409
- )
2410
-
2411
-
2412
- @run_spec_function("cleva")
2413
- def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, prompt_id: int = 0) -> RunSpec:
2414
- from .scenarios.cleva_scenario import CLEVAScenario # noqa
2415
-
2416
- CLEVAScenario.download_dataset(task, version)
2417
-
2418
- _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
2419
- inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)
2420
-
2421
- class_name_prefix = "".join([word.capitalize() for word in task.split("_")])
2422
- scenario_spec = ScenarioSpec(
2423
- class_name=f"helm.benchmark.scenarios.cleva_scenario.CLEVA{class_name_prefix}Scenario",
2424
- args={"version": version, "subtask": subtask, "prompt_id": prompt_id},
1496
+ adapter_spec = get_multiple_choice_adapter_spec(
1497
+ method=method,
1498
+ instructions="The following are multiple choice questions (with answers).",
1499
+ input_noun="Question",
1500
+ output_noun="Answer",
1501
+ max_train_instances=5,
2425
1502
  )
2426
- run_spec_name: str = f"cleva:task={task},version={version},prompt_id={prompt_id}"
2427
- if subtask:
2428
- run_spec_name += f",subtask={subtask}"
2429
-
2430
- if task in ["copyright"]:
2431
- adapter_spec = get_completion_adapter_spec(
2432
- temperature=inference_parameters.get("temperature", 0.2),
2433
- max_tokens=inference_parameters.get("max_tokens", 1024),
2434
- num_outputs=inference_parameters.get("num_outputs", 1),
2435
- )
2436
- args = {"normalize_by_prefix_length": True, "normalize_newline_space_tab": False}
2437
- metric_specs = get_cleva_copyright_metric_spec(args) + get_cleva_generative_harms_metric_specs()
2438
- elif task in ["code_synthesis"]:
2439
- adapter_spec = get_completion_adapter_spec(
2440
- instructions=prompt_setting.instructions,
2441
- temperature=inference_parameters.get("temperature", 0.2),
2442
- # Taken from the original OpenAI paper to prevent the further generation of irrelevant classes/functions
2443
- stop_sequences=inference_parameters.get("stop_sequences", ["\nclass", "\ndef", "\nif", "\nprint"]),
2444
- max_tokens=inference_parameters.get("max_tokens", 600),
2445
- )
2446
- metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) + get_cleva_generative_harms_metric_specs()
2447
- elif task in ["language_modeling"]:
2448
- adapter_spec = get_language_modeling_adapter_spec()
2449
- metric_specs = get_basic_metric_specs([])
2450
- else:
2451
- if prompt_setting.method in [
2452
- ADAPT_MULTIPLE_CHOICE_JOINT,
2453
- ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
2454
- ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
2455
- ]:
2456
- if prompt_setting.method == ADAPT_MULTIPLE_CHOICE_JOINT:
2457
- adapter_spec = AdapterSpec(
2458
- method=prompt_setting.method,
2459
- instructions=prompt_setting.instructions,
2460
- input_prefix=prompt_setting.input_prefix,
2461
- input_suffix=prompt_setting.input_suffix,
2462
- output_prefix=prompt_setting.output_prefix,
2463
- output_suffix=prompt_setting.output_suffix,
2464
- max_train_instances=inference_parameters.get("max_train_instances", 5),
2465
- num_outputs=inference_parameters.get("num_outputs", 5),
2466
- max_tokens=inference_parameters.get("max_tokens", 1),
2467
- temperature=inference_parameters.get("temperature", 0.0),
2468
- stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
2469
- sample_train=inference_parameters.get("sample_train", True),
2470
- multi_label=inference_parameters.get("multi_label", False),
2471
- )
2472
- else:
2473
- adapter_spec = AdapterSpec(
2474
- method=prompt_setting.method,
2475
- instructions=prompt_setting.instructions,
2476
- input_prefix=prompt_setting.input_prefix,
2477
- input_suffix=prompt_setting.input_suffix,
2478
- output_prefix=prompt_setting.output_prefix,
2479
- output_suffix=prompt_setting.output_suffix,
2480
- # Separate is basically language modeling, so can't easily use in-context examples
2481
- max_train_instances=inference_parameters.get("max_train_instances", 5),
2482
- num_outputs=1,
2483
- max_tokens=0,
2484
- temperature=inference_parameters.get("temperature", 0.0),
2485
- sample_train=inference_parameters.get("sample_train", True),
2486
- )
2487
- metric_specs = get_exact_match_metric_specs()
2488
- if task in ["fact_checking", "bias"]:
2489
- metric_specs += get_multiple_choice_classification_metric_specs()
2490
- elif prompt_setting.method == ADAPT_GENERATION:
2491
- adapter_spec = AdapterSpec(
2492
- method=prompt_setting.method,
2493
- instructions=prompt_setting.instructions,
2494
- input_prefix=prompt_setting.input_prefix,
2495
- input_suffix=prompt_setting.input_suffix,
2496
- output_prefix=prompt_setting.output_prefix,
2497
- output_suffix=prompt_setting.output_suffix,
2498
- max_train_instances=inference_parameters.get("max_train_instances", 5),
2499
- num_outputs=inference_parameters.get("num_outputs", 1),
2500
- max_tokens=inference_parameters.get("max_tokens", 20),
2501
- temperature=inference_parameters.get("temperature", 0.0),
2502
- stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
2503
- sample_train=inference_parameters.get("sample_train", True),
2504
- multi_label=inference_parameters.get("multi_label", True),
2505
- )
2506
- metric_specs = (
2507
- get_cleva_generative_task_metric_spec(task, subtask) + get_cleva_generative_harms_metric_specs()
2508
- )
2509
- else:
2510
- raise ValueError(
2511
- f"{task} can only be {ADAPT_GENERATION}, {ADAPT_MULTIPLE_CHOICE_JOINT}, "
2512
- f"{ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} or {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL}"
2513
- )
2514
1503
 
2515
1504
  return RunSpec(
2516
- name=run_spec_name,
1505
+ name=f"thai_exam:exam={exam},method={method}",
2517
1506
  scenario_spec=scenario_spec,
2518
1507
  adapter_spec=adapter_spec,
2519
- metric_specs=metric_specs,
2520
- groups=["cleva", f"cleva_{task}"],
1508
+ metric_specs=get_exact_match_metric_specs(),
1509
+ groups=["thai_exam"],
2521
1510
  )
2522
-
2523
-
2524
- ############################################################
2525
-
2526
-
2527
- def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
2528
- """
2529
- Takes a specification (name, args) and returns a list of `RunSpec`s.
2530
- """
2531
- # Note that we are abusing `spec` a bit because the name is not actually a class name.
2532
- name = spec.class_name
2533
- args = spec.args
2534
-
2535
- if name not in CANONICAL_RUN_SPEC_FUNCS:
2536
- raise ValueError(f"Unknown run spec name: {name}")
2537
-
2538
- # Peel off the run expanders (e.g., model)
2539
- expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS] # type: ignore
2540
- args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
2541
-
2542
- # Get the canonical run specs
2543
- run_specs = [CANONICAL_RUN_SPEC_FUNCS[name](**args)]
2544
-
2545
- # Apply expanders
2546
- for expander in expanders:
2547
- run_specs = [
2548
- child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
2549
- ]
2550
-
2551
- def alter_run_spec(run_spec: RunSpec) -> RunSpec:
2552
- try:
2553
- model = get_model(run_spec.adapter_spec.model)
2554
- except ValueError:
2555
- # Models registered from configs cannot have expanders applied to them,
2556
- # because the models will not have been registered yet at this point.
2557
- # TODO: Figure out a cleaner way to deal with this.
2558
- return run_spec
2559
- # For models that strip newlines, when we're generating, we need to set
2560
- # the delimiter to be '###' so we stop properly.
2561
- if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
2562
- ADAPT_GENERATION,
2563
- ADAPT_MULTIPLE_CHOICE_JOINT,
2564
- ):
2565
- stop_expander = StopRunExpander(value="hash")
2566
- run_spec = singleton(stop_expander.expand(run_spec))
2567
-
2568
- if NLG_PREFIX_TAG in model.tags:
2569
- global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
2570
- run_spec = singleton(global_prefix_expander.expand(run_spec))
2571
-
2572
- # When running ChatGPT on non-language modelling tasks, increase max_tokens by 1
2573
- # to add room for the special message role token.
2574
- if OPENAI_CHATGPT_MODEL_TAG in model.tags and run_spec.adapter_spec.max_tokens:
2575
- increase_max_tokens_expander = IncreaseMaxTokensRunExpander(value=1)
2576
- run_spec = singleton(increase_max_tokens_expander.expand(run_spec))
2577
-
2578
- if CHATML_MODEL_TAG in model.tags:
2579
- chatml_expander = ChatMLRunExpander()
2580
- run_spec = singleton(chatml_expander.expand(run_spec))
2581
-
2582
- # Special handling for Anthropic Claude
2583
- if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
2584
- try:
2585
- import anthropic
2586
- from helm.proxy.clients.anthropic_client import AnthropicClient
2587
- except ModuleNotFoundError as e:
2588
- handle_module_not_found_error(e, ["anthropic"])
2589
- claude_run_expanders: List[RunExpander] = []
2590
- claude_run_expanders.append(AddToStopRunExpander(anthropic.HUMAN_PROMPT))
2591
- if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags:
2592
- claude_run_expanders.append(IncreaseMaxTokensRunExpander(value=AnthropicClient.ADDITIONAL_TOKENS))
2593
- # Get scenario tags
2594
- components = run_spec.scenario_spec.class_name.split(".")
2595
- class_name = components[-1]
2596
- module_name = ".".join(components[:-1])
2597
- cls = getattr(importlib.import_module(module_name), class_name)
2598
- scenario_tags: List[str] = cls.tags
2599
- # If the scenario is instruction, do not use PROMPT_ANSWER_START
2600
- if "instructions" in scenario_tags:
2601
- claude_run_expanders.append(
2602
- FormatPromptRunExpander(prefix=anthropic.HUMAN_PROMPT, suffix=f"{anthropic.AI_PROMPT}")
2603
- )
2604
- else:
2605
- claude_run_expanders.append(
2606
- FormatPromptRunExpander(
2607
- prefix=anthropic.HUMAN_PROMPT,
2608
- suffix=f"{anthropic.AI_PROMPT} {AnthropicClient.PROMPT_ANSWER_START}",
2609
- )
2610
- )
2611
- for claude_run_expander in claude_run_expanders:
2612
- run_spec = singleton(claude_run_expander.expand(run_spec))
2613
-
2614
- # For multiple choice
2615
- if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
2616
- increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
2617
- run_spec = singleton(increase_temperature_expander.expand(run_spec))
2618
-
2619
- return run_spec
2620
-
2621
- run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
2622
-
2623
- return run_specs
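To close out this file: the removed construct_run_specs above peels run expanders (model, max_train_instances, and so on) off the arguments and applies them in sequence, so each expander can fan one RunSpec out into several. A standalone sketch of that fan-out loop, with strings standing in for RunSpec objects:

    from typing import Callable, List

    Spec = str  # stand-in for RunSpec
    Expander = Callable[[Spec], List[Spec]]

    def expand_all(specs: List[Spec], expanders: List[Expander]) -> List[Spec]:
        # Mirrors the loop in construct_run_specs: each pass replaces every parent spec
        # with the children produced by the current expander.
        for expander in expanders:
            specs = [child for parent in specs for child in expander(parent)]
        return specs

    by_model: Expander = lambda spec: [f"{spec},model=a", f"{spec},model=b"]
    by_shots: Expander = lambda spec: [f"{spec},max_train_instances={n}" for n in (0, 5)]

    # One base entry fans out into 2 models x 2 shot counts = 4 concrete run specs.
    assert len(expand_all(["mmlu:subject=anatomy"], [by_model, by_shots])) == 4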