crfm-helm 0.5.4 → 0.5.5 (py3-none-any.whl)
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
```diff
--- /dev/null
+++ b/helm/benchmark/run_specs/capabilities_run_specs.py
@@ -0,0 +1,308 @@
+"""Run spec functions for the HELM Capabilities leaderboard.
+
+Website: https://crfm.stanford.edu/helm/capabilities/"""
+
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_CHAT,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+    AdapterSpec,
+)
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+)
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+def _bool_to_str(value: bool):
+    return str(value).lower()
+
+
+@run_spec_function("mmlu_pro")
+def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec:
+    # Convert to bools and remove the str versions
+    use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
+    use_few_shot_bool: bool = use_few_shot.lower() == "true"
+    del use_chain_of_thought
+    del use_few_shot
+
+    run_spec_name = f"mmlu_pro:subset={subject},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}"  # noqa: E501
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject}
+    )
+    max_train_instance_num = 5 if use_few_shot_bool else 0
+
+    if use_chain_of_thought_bool:
+        adapter_spec = AdapterSpec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+            max_train_instances=max_train_instance_num,
+            max_tokens=4096,  # original: 4000
+            input_prefix="What is the correct answer to this question: ",
+            input_suffix="\nChoices:\n",
+            output_prefix="",
+            global_suffix=(
+                "Let’s think step by step. Based on your reasoning, what is the single, "
+                "most likely answer choice? Format your response as follows: "
+                '"The correct answer is (insert answer here)".'
+            ),
+        )
+        return RunSpec(
+            name=run_spec_name,
+            scenario_spec=scenario_spec,
+            adapter_spec=adapter_spec,
+            metric_specs=get_basic_metric_specs([])
+            + [
+                MetricSpec(
+                    class_name="helm.benchmark.metrics.gpqa_chain_of_thought_metric.GPQAChainOfThoughtMetric", args={}
+                ),
+            ],
+            groups=["mmlu_pro"],
+        )
+
+    else:
+        adapter_spec = AdapterSpec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT,
+            max_train_instances=max_train_instance_num,
+            max_tokens=4096,  # original: 4000
+            input_prefix="What is the correct answer to this question: ",
+            input_suffix="\nChoices:\n",
+            output_prefix="",
+            global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
+        )
+        return RunSpec(
+            name=run_spec_name,
+            scenario_spec=scenario_spec,
+            adapter_spec=adapter_spec,
+            metric_specs=get_exact_match_metric_specs(),
+            groups=["mmlu_pro"],
+        )
+
+
+@run_spec_function("gpqa")
+def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec:
+    # Convert to bools and remove the str versions
+    use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
+    use_few_shot_bool: bool = use_few_shot.lower() == "true"
+    del use_chain_of_thought
+    del use_few_shot
+
+    if not subset.startswith("gpqa_"):
+        subset = "gpqa_" + subset
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
+    )
+    max_train_instance_num = 5 if use_few_shot_bool else 0
+
+    if use_few_shot_bool:
+        if use_chain_of_thought_bool:
+            adapter_spec = get_multiple_choice_adapter_spec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+                max_tokens=2000,  # original: 1000
+                max_train_instances=max_train_instance_num,
+                instructions=(
+                    "Here are some example questions from experts. "
+                    "An explanation is given before the final answer. "
+                    "Answer the final question yourself, giving your reasoning beforehand."
+                ),
+                input_noun="Question",
+                input_suffix="\nChoices: \n",
+                reference_prefix="(A) ",
+                chain_of_thought_prefix="Let's think step by step: ",
+                chain_of_thought_suffix="The correct answer is ",
+                output_noun="",  # will be overwritten with output_prefix
+                output_prefix="",
+                global_suffix=(
+                    "Give step by step reasoning before you answer, and when you’re ready to answer, "
+                    'please use the format "The correct answer is (insert answer here)":'
+                ),
+            )
+        else:
+            adapter_spec = get_multiple_choice_adapter_spec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT,
+                max_train_instances=max_train_instance_num,
+                instructions=(
+                    "Here are some example questions from experts. "
+                    "An explanation is given before the final answer. "
+                    "Answer the final question yourself, giving your reasoning beforehand."
+                ),
+                input_noun="Question",
+                input_suffix="\nChoices: \n",
+                reference_prefix="(A) ",
+                output_noun="",  # will be overwritten with output_prefix
+                output_prefix="The correct answer is ",
+            )
+    else:
+        if use_chain_of_thought_bool:
+            adapter_spec = AdapterSpec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+                max_train_instances=max_train_instance_num,
+                max_tokens=4096,  # original: 1000
+                input_prefix="What is the correct answer to this question: ",
+                input_suffix="\nChoices:\n",
+                output_prefix="",
+                reference_prefix="(A) ",
+                global_suffix=(
+                    "Let’s think step by step. Based on your reasoning, what is the single, "
+                    "most likely answer choice? Format your response as follows: "
+                    '"The correct answer is (insert answer here)".'
+                ),
+            )
+        else:
+            adapter_spec = AdapterSpec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT,
+                max_train_instances=max_train_instance_num,
+                max_tokens=4096,  # original: 1000
+                input_prefix="What is the correct answer to this question: ",
+                input_suffix="\nChoices:\n",
+                output_prefix="",
+                reference_prefix="(A) ",
+                global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
+            )
+
+    metric_specs = (
+        (
+            get_basic_metric_specs([])
+            + [
+                MetricSpec(
+                    class_name="helm.benchmark.metrics.gpqa_chain_of_thought_metric.GPQAChainOfThoughtMetric", args={}
+                ),
+            ]
+        )
+        if use_chain_of_thought_bool
+        else get_exact_match_metric_specs()
+    )
+
+    return RunSpec(
+        name=f"gpqa:subset={subset},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}",  # noqa: E501
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["gpqa"],
+    )
+
+
+@run_spec_function("ifeval")
+def get_ifeval_spec() -> RunSpec:
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario")
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        input_prefix="",
+        output_prefix="",
+        max_tokens=4096,  # Unknown number from paper
+        num_outputs=1,
+        temperature=0.0,
+    )
+
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.ifeval_metrics.IFEvalMetric")
+    ]
+
+    return RunSpec(
+        name="ifeval",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["ifeval"],
+    )
+
+
+@run_spec_function("wildbench")
+def get_wildbench_spec(subset: str) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
+        args={
+            "subset": subset,
+            "use_model_outputs": False,
+        },
+    )
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")
+    ]
+
+    return RunSpec(
+        name=f"wildbench:subset={subset}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["wildbench"],
+    )
+
+
+# TODO: Remove BigCodeBench from capabilities_run_specs.py because it is no longer part of HELM Capabilities
+@run_spec_function("bigcodebench")
+def get_bigcodebench_spec(version: str) -> RunSpec:
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
+    )
+
+    # Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        input_prefix="",
+        output_prefix="",
+        max_tokens=4096,  # original: 1280
+        num_outputs=1,
+        temperature=0.0,
+        global_prefix="Please provide a self-contained Python script "
+        "that solves the following problem in a markdown code block:",
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.bigcodebench_annotator.BigCodeBenchAnnotator")
+    ]
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.bigcodebench_metrics.BigCodeBenchMetric")
+    ]
+
+    return RunSpec(
+        name=f"bigcodebench:version={version}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["bigcodebench"],
+    )
+
+
+@run_spec_function("omni_math")
+def get_omni_math_spec() -> RunSpec:
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="Answer the question, giving your reasoning beforehand. Wrap the final answer with the \\boxed{} command.",  # noqa: E501
+        input_prefix="",
+        output_prefix="",
+        max_tokens=4096,  # original: 2048
+        num_outputs=1,
+        temperature=0.0,
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.omni_math_annotator.OmniMATHAnnotator")]
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.omni_math_metrics.OmniMATHMetric")
+    ]
+
+    return RunSpec(
+        name="omni_math",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["omni_math"],
+    )
```
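Note: each `@run_spec_function("...")` decorator above registers its function in HELM's run-spec registry, so the new benchmarks can be requested by run-entry strings such as `gpqa:subset=diamond,use_chain_of_thought=true`. A minimal sketch of calling one of the new functions directly; the `diamond` subset value is an assumed example, not taken from this diff:

```python
# Illustrative sketch only: resolving one of the new run specs directly,
# assuming crfm-helm 0.5.5 is installed. "diamond" is an assumed subset name.
from helm.benchmark.run_specs.capabilities_run_specs import get_gpqa_spec

run_spec = get_gpqa_spec(subset="diamond", use_chain_of_thought="true", use_few_shot="false")
# The function normalizes the subset to "gpqa_diamond" before building the name.
print(run_spec.name)  # gpqa:subset=gpqa_diamond,use_chain_of_thought=true,use_few_shot=false
```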
```diff
--- a/helm/benchmark/run_specs/classic_run_specs.py
+++ b/helm/benchmark/run_specs/classic_run_specs.py
@@ -387,7 +387,7 @@ def get_numeracy_spec(
 ) -> RunSpec:
     from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
 
-    run_solver_bool: bool = True if run_solver == "
+    run_solver_bool: bool = True if run_solver.lower() == "true" else False
     del run_solver
     random_seed = int(seed)
     scenario_spec = ScenarioSpec(
```
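The one-line fix above makes the string-valued `run_solver` flag case-insensitive. A minimal standalone sketch of the pattern; `parse_bool_flag` is a hypothetical name, not a HELM helper:

```python
# Minimal sketch of the case-insensitive flag parsing introduced above.
def parse_bool_flag(value: str) -> bool:
    # Accepts "true", "True", "TRUE", etc.; everything else is False.
    return value.lower() == "true"

assert parse_bool_flag("True") is True
assert parse_bool_flag("true") is True
assert parse_bool_flag("False") is False
```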
```diff
--- a/helm/benchmark/run_specs/classic_run_specs.py
+++ b/helm/benchmark/run_specs/classic_run_specs.py
@@ -1082,27 +1082,6 @@ def get_me_q_sum_spec() -> RunSpec:
     )
 
 
-@run_spec_function("med_dialog")
-def get_med_dialog_spec(subset: str) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.med_dialog_scenario.MedDialogScenario", args={"subset": subset}
-    )
-
-    adapter_spec = get_summarization_adapter_spec(
-        num_sents=1,
-        max_tokens=128,
-        temperature=0.3,
-    )
-
-    return RunSpec(
-        name=f"med_dialog,subset={subset}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
-        groups=["MedDialog"],
-    )
-
-
 @run_spec_function("med_mcqa")
 def get_med_mcqa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.med_mcqa_scenario.MedMCQAScenario", args={})
```
```diff
--- a/helm/benchmark/run_specs/classic_run_specs.py
+++ b/helm/benchmark/run_specs/classic_run_specs.py
@@ -1145,26 +1124,6 @@ def get_med_paragraph_simplification_spec() -> RunSpec:
     )
 
 
-@run_spec_function("pubmed_qa")
-def get_pubmed_qa_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})
-
-    adapter_spec = get_multiple_choice_adapter_spec(
-        method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="Answer A for yes, B for no or C for maybe.",
-        input_noun="Question",
-        output_noun="Answer",
-    )
-
-    return RunSpec(
-        name="pubmed_qa",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
-        groups=["pubmed_qa"],
-    )
-
-
 @run_spec_function("live_qa")
 def get_live_qa_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
```
```diff
--- a/helm/benchmark/run_specs/classic_run_specs.py
+++ b/helm/benchmark/run_specs/classic_run_specs.py
@@ -1191,33 +1150,6 @@ def get_live_qa_spec() -> RunSpec:
     )
 
 
-@run_spec_function("medication_qa")
-def get_medication_qa_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
-
-    adapter_spec = get_generation_adapter_spec(
-        instructions="Please answer the following consumer health question.",
-        input_noun="Question",
-        output_noun="Answer",
-        max_train_instances=0,
-        max_tokens=512,
-    )
-
-    annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.medication_qa_annotator.MedicationQAAnnotator")
-    ]
-    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.medication_qa_metrics.MedicationQAScoreMetric")]
-
-    return RunSpec(
-        name="medication_qa",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        annotators=annotator_specs,
-        metric_specs=metric_specs,
-        groups=["medication_qa"],
-    )
-
-
 @run_spec_function("lextreme")
 def get_lextreme_spec(subset: str) -> RunSpec:
     from helm.benchmark.scenarios.lextreme_scenario import (
```
```diff
--- /dev/null
+++ b/helm/benchmark/run_specs/enem_challenge_specs.py
@@ -0,0 +1,31 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("enem_challenge")
+def get_enem_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.enem_challenge_scenario.ENEMChallengeScenario", args={}
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
+        "Se as opções forem A, B, C, D e E, "
+        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
+        "Exemplo: Qual é a capital da França?\nA. Londres\nB. Paris\nC. Roma\nD. Berlim\nE. Sydney\n"
+        "Resposta: B",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="enem_challenge",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["enem_challenge"],
+    )
```
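For reference, under `ADAPT_MULTIPLE_CHOICE_JOINT` the `input_noun`/`output_noun` values become the prompt labels. A rough sketch of how an enem_challenge prompt could be laid out, with the layout assumed from the nouns and the example embedded in the instructions string, not taken verbatim from the HELM adapter:

```python
# Rough illustration of the joint multiple-choice prompt layout assumed above;
# this mimics, rather than calls, the HELM adapter.
question = "Qual é a capital da França?"
options = ["Londres", "Paris", "Roma", "Berlim", "Sydney"]

lines = [f"Pergunta: {question}"]
lines += [f"{letter}. {option}" for letter, option in zip("ABCDE", options)]
lines.append("Resposta:")
print("\n".join(lines))
```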
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""Run spec functions for HELM Enterprise scenarios."""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
|
|
5
|
+
from helm.benchmark.adaptation.common_adapter_specs import (
|
|
6
|
+
get_generation_adapter_spec,
|
|
7
|
+
get_multiple_choice_adapter_spec,
|
|
8
|
+
)
|
|
9
|
+
from helm.benchmark.metrics.common_metric_specs import (
|
|
10
|
+
get_basic_metric_specs,
|
|
11
|
+
get_exact_match_metric_specs,
|
|
12
|
+
)
|
|
13
|
+
from helm.benchmark.metrics.metric import MetricSpec
|
|
14
|
+
from helm.benchmark.run_spec import RunSpec, run_spec_function
|
|
15
|
+
from helm.benchmark.scenarios.scenario import ScenarioSpec
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _get_weighted_classification_metric_specs(labels: List[str]) -> List[MetricSpec]:
|
|
19
|
+
return [
|
|
20
|
+
MetricSpec(
|
|
21
|
+
class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
|
|
22
|
+
args={"averages": ["weighted"], "scores": ["f1", "precision", "recall"], "labels": labels},
|
|
23
|
+
)
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Finance
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@run_spec_function("gold_commodity_news")
|
|
31
|
+
def get_news_headline_spec(category: str) -> RunSpec:
|
|
32
|
+
from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario
|
|
33
|
+
|
|
34
|
+
scenario_spec = ScenarioSpec(
|
|
35
|
+
class_name="helm.benchmark.scenarios.gold_commodity_news_scenario.GoldCommodityNewsScenario",
|
|
36
|
+
args={"category": category},
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
adapter_spec = get_generation_adapter_spec(
|
|
40
|
+
instructions=GoldCommodityNewsScenario.get_instructions(category), input_noun="Headline", output_noun="Answer"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
return RunSpec(
|
|
44
|
+
name=f"gold_commodity_news:category={category}",
|
|
45
|
+
scenario_spec=scenario_spec,
|
|
46
|
+
adapter_spec=adapter_spec,
|
|
47
|
+
metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
|
|
48
|
+
groups=["gold_commodity_news"],
|
|
49
|
+
)
+
+
+@run_spec_function("financial_phrasebank")
+def get_financial_phrasebank_spec(agreement: int = 50) -> RunSpec:
+    from helm.benchmark.scenarios.financial_phrasebank_scenario import FinancialPhrasebankScenario
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.financial_phrasebank_scenario.FinancialPhrasebankScenario",
+        args={"agreement": agreement},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=FinancialPhrasebankScenario.INSTRUCTIONS,
+        input_noun="Sentence",
+        output_noun="Label",
+        max_tokens=30,
+    )
+
+    return RunSpec(
+        name=f"financial_phrasebank:agreement={agreement}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs()
+        + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
+        groups=["financial_phrasebank"],
+    )
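The `agreement` argument selects a Financial PhraseBank subset by inter-annotator agreement; the upstream dataset ships 50%, 66%, 75%, and 100% agreement splits, though which of those values this scenario accepts is an assumption here:

```python
# Sketch: selecting the >=75% annotator-agreement subset. Whether the scenario
# validates against the upstream 50/66/75/100 splits is an assumption.
run_spec = get_financial_phrasebank_spec(agreement=75)
print(run_spec.name)  # "financial_phrasebank:agreement=75"
```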
+
+
+@run_spec_function("conv_fin_qa_calc")
+def get_conv_fin_qa_calc_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.conv_fin_qa_calc_scenario.ConvFinQACalcScenario", args={}
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Based on the table, answer the final question. Respond with the answer only, with no additional explanation.",  # noqa: E501
+        input_noun=None,
+        output_noun="Answer",
+    )
+
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.conv_fin_qa_calc_metrics.ConvFinQACalcMetric")
+    ] + get_basic_metric_specs([])
+
+    return RunSpec(
+        name="conv_fin_qa_calc",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["conv_fin_qa_calc"],
+    )
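Each factory in this file is registered under its string name by the `run_spec_function` decorator, which is what lets run entries like `conv_fin_qa_calc` be resolved by name. A minimal sketch of that decorator pattern (the real implementation in `helm.benchmark.run_spec` keeps additional metadata):

```python
# Minimal sketch of the name -> factory registry that @run_spec_function
# implies; the dict name and error text are illustrative, not HELM internals.
from typing import Callable, Dict

_REGISTRY: Dict[str, Callable] = {}

def run_spec_function(name: str):
    def register(factory: Callable) -> Callable:
        if name in _REGISTRY:
            raise ValueError(f"Duplicate run spec function name: {name}")
        _REGISTRY[name] = factory
        return factory
    return register
```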
+
+
+# Legal
+
+
+@run_spec_function("legal_contract_summarization")
+def get_legal_contract_summarization_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
+        args={},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Summarize the legal document in plain English.",
+        input_noun="Document",
+        output_noun="Summary",
+        max_tokens=100,
+        stop_sequences=["\n\n"],
+    )
+
+    return RunSpec(
+        name="legal_contract_summarization",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]),
+        groups=["legal_contract_summarization"],
+    )
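This spec scores summaries with ROUGE-1, ROUGE-2, and ROUGE-L rather than exact match. As a rough illustration of what those metrics capture, a sketch using the `rouge-score` package (HELM's internal ROUGE wiring may differ in tokenization and stemming; the sentences are invented):

```python
# Sketch: ROUGE overlap between a reference and a generated summary, using
# the rouge-score package; HELM's own ROUGE setup may differ.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(
    target="The tenant must pay rent on the first day of each month.",
    prediction="Rent is due on the first of every month.",
)
print(scores["rougeL"].fmeasure)  # longest-common-subsequence F1
```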
+
+
+@run_spec_function("legal_opinion_sentiment_classification")
+def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",  # noqa: E501
+    )
+
+    instructions = "Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative."  # noqa: E501
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        output_noun="Label",
+    )
+
+    return RunSpec(
+        name="legal_opinion_sentiment_classification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs()
+        + _get_weighted_classification_metric_specs(labels=["positive", "neutral", "negative"]),
+        groups=["legal_opinion_sentiment_classification"],
+    )
+
+
+@run_spec_function("casehold")
+def get_casehold_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario", args={})
+
+    method = ADAPT_MULTIPLE_CHOICE_JOINT
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="Give a letter answer among A, B, C, D, or E.",
+        input_noun="Passage",
+        output_noun="Answer",
+        max_train_instances=2,
+    )
+
+    return RunSpec(
+        name="casehold",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["casehold"],
+    )
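With `ADAPT_MULTIPLE_CHOICE_JOINT`, all answer options are rendered into a single prompt and the model is asked for one letter, which is why exact match suffices as the metric. Roughly, the rendered prompt has this shape (a sketch: the exact layout is an adapter implementation detail, and all placeholder text is invented):

```python
# Sketch of the joint multiple-choice prompt shape; exact spacing/ordering is
# an adapter detail, and the placeholder passage and options are invented.
prompt = (
    "Give a letter answer among A, B, C, D, or E.\n\n"
    "Passage: <judicial opinion excerpt ending in a citation>\n"
    "A. <candidate holding 1>\n"
    "B. <candidate holding 2>\n"
    "C. <candidate holding 3>\n"
    "D. <candidate holding 4>\n"
    "E. <candidate holding 5>\n"
    "Answer:"
)
```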
+
+
+@run_spec_function("echr_judgment_classification")
+def get_echr_judgment_classification_spec() -> RunSpec:
+    """A different implementation (binary classification) of lex_glue_fixed:subset=ecthr_a"""
+    from helm.benchmark.scenarios.echr_judgment_classification_scenario import EchrJudgeScenario
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.echr_judgment_classification_scenario.EchrJudgeScenario",
+        args={"filter_max_length": 600},
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX,
+        input_noun=EchrJudgeScenario.PROMPT_INPUT,
+        output_noun=EchrJudgeScenario.PROMPT_OUTPUT,
+        max_tokens=1,
+    )
+
+    return RunSpec(
+        name="echr_judgment_classification",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(labels=["yes", "no"]),
+        groups=["echr_judgment_classification"],
+    )
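Note the `max_tokens=1` cap: decoding stops after a single token, which fits the binary verdict implied by the `["yes", "no"]` labels passed to the weighted classification metrics above (the exact answer strings live in the scenario's prompt constants, which are not visible in this hunk).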
+
+
+# Climate
+
+
+@run_spec_function("sumosum")
+def get_sumosum_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.sumosum_scenario.SUMOSumScenario",
+        args={
+            # A too-short article could be garbage.
+            "test_filter_min_length": 100,
+            # A too-long article doesn't fit in a prompt.
+            "test_filter_max_length": 3700,
+        },
+    )
+
+    instructions = "Generate the title of the following article."
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        output_noun="Title",
+        max_train_instances=0,
+        max_tokens=100,
+        stop_sequences=["\n\n"],
+    )
+
+    return RunSpec(
+        name="sumosum",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]),
+        groups=["sumosum"],
+    )
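Here `max_train_instances=0` makes the run zero-shot (no in-context examples are prepended), and `stop_sequences=["\n\n"]` truncates generation at the first blank line so only the title is kept.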
+
+
+# Cyber Security
+
+
+@run_spec_function("cti_to_mitre")
+def get_cti_to_mitre_spec(num_options: int = 10, seed: int = 42, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.cti_to_mitre_scenario.CtiToMitreScenario",
+        args={
+            "num_options": num_options,
+            "seed": seed,
+        },
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="Classify the following situation by the type of security attack. Answer with only a single letter.",  # noqa: E501
+        input_noun="Situation",
+        output_noun="Answer",
+        max_train_instances=10,
+    )
+
+    return RunSpec(
+        name=f"cti_to_mitre:num_options={num_options},seed={seed},method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["cti_to_mitre"],
+    )
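Because the arguments are baked into the RunSpec name, parameterized entries round-trip cleanly. A sketch, assuming `ADAPT_MULTIPLE_CHOICE_JOINT` is the string `"multiple_choice_joint"` as defined in `helm.benchmark.adaptation.adapter_spec`:

```python
# Sketch: the RunSpec name encodes its arguments. Assumes
# ADAPT_MULTIPLE_CHOICE_JOINT == "multiple_choice_joint".
run_spec = get_cti_to_mitre_spec(num_options=5, seed=0)
print(run_spec.name)  # "cti_to_mitre:num_options=5,seed=0,method=multiple_choice_joint"
```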