PyPI - crfm-helm - Versions diffs - 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (606) hide show

helm/benchmark/run_specs/enterprise_run_specs.py ADDED Viewed

@@ -0,0 +1,260 @@
+"""Run spec functions for HELM Enterprise scenarios."""
+from typing import List
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import (
+    get_generation_adapter_spec,
+    get_multiple_choice_adapter_spec,
+)
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+)
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+def _get_weighted_classification_metric_specs(labels: List[str]) -> List[MetricSpec]:
+    """Build the weighted-average classification metric spec for the given labels.
+
+    Args:
+        labels: Label strings passed through unchanged as the metric's ``labels`` argument.
+
+    Returns:
+        A one-element list containing a ``ClassificationMetric`` spec configured with
+        the ``weighted`` average and the ``f1``, ``precision`` and ``recall`` scores.
+    """
+    metric_args = {
+        "averages": ["weighted"],
+        "scores": ["f1", "precision", "recall"],
+        "labels": labels,
+    }
+    weighted_spec = MetricSpec(
+        class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
+        args=metric_args,
+    )
+    return [weighted_spec]
+# Finance
+@run_spec_function("gold_commodity_news")
+def get_news_headline_spec(category: str) -> RunSpec:
+    """Create the run spec for the gold commodity news headline scenario.
+
+    Args:
+        category: Forwarded to the scenario as its ``category`` argument, used to look up
+            the instructions via ``GoldCommodityNewsScenario.get_instructions`` and
+            embedded in the run spec name.
+
+    Returns:
+        A ``RunSpec`` scored with exact-match metrics plus weighted classification
+        metrics over the labels ``yes`` and ``no``.
+    """
+    # Imported at function scope rather than at module level.
+    from helm.benchmark.scenarios.gold_commodity_news_scenario import GoldCommodityNewsScenario
+    headline_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.gold_commodity_news_scenario.GoldCommodityNewsScenario",
+        args={"category": category},
+    )
+    headline_adapter = get_generation_adapter_spec(
+        instructions=GoldCommodityNewsScenario.get_instructions(category),
+        input_noun="Headline",
+        output_noun="Answer",
+    )
+    yes_no_metrics = _get_weighted_classification_metric_specs(labels=["yes", "no"])
+    return RunSpec(
+        name=f"gold_commodity_news:category={category}",
+        scenario_spec=headline_scenario,
+        adapter_spec=headline_adapter,
+        metric_specs=get_exact_match_metric_specs() + yes_no_metrics,
+        groups=["gold_commodity_news"],
+    )
+@run_spec_function("financial_phrasebank")
+def get_financial_phrasebank_spec(agreement: int = 50) -> RunSpec:
+    """Create the run spec for the Financial Phrasebank scenario.
+
+    Args:
+        agreement: Forwarded unchanged to the scenario as its ``agreement`` argument
+            and embedded in the run spec name.
+
+    Returns:
+        A ``RunSpec`` scored with exact-match metrics plus weighted classification
+        metrics over the labels ``positive``, ``neutral`` and ``negative``.
+    """
+    # Imported at function scope rather than at module level.
+    from helm.benchmark.scenarios.financial_phrasebank_scenario import FinancialPhrasebankScenario
+    sentiment_labels = ["positive", "neutral", "negative"]
+    phrasebank_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.financial_phrasebank_scenario.FinancialPhrasebankScenario",
+        args={"agreement": agreement},
+    )
+    label_adapter = get_generation_adapter_spec(
+        instructions=FinancialPhrasebankScenario.INSTRUCTIONS,
+        input_noun="Sentence",
+        output_noun="Label",
+        max_tokens=30,
+    )
+    all_metrics = get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(
+        labels=sentiment_labels
+    )
+    return RunSpec(
+        name=f"financial_phrasebank:agreement={agreement}",
+        scenario_spec=phrasebank_scenario,
+        adapter_spec=label_adapter,
+        metric_specs=all_metrics,
+        groups=["financial_phrasebank"],
+    )
+@run_spec_function("conv_fin_qa_calc")
+def get_conv_fin_qa_calc_spec() -> RunSpec:
+    """Create the run spec for the ConvFinQA calculation scenario.
+
+    Returns:
+        A ``RunSpec`` named ``conv_fin_qa_calc`` whose metrics are ``ConvFinQACalcMetric``
+        followed by the basic metric specs requested with an empty name list.
+    """
+    prompt = "Based on the table, answer the final question. Respond with the answer only, with no additional explanation."  # noqa: E501
+    calc_metric = MetricSpec(class_name="helm.benchmark.metrics.conv_fin_qa_calc_metrics.ConvFinQACalcMetric")
+    return RunSpec(
+        name="conv_fin_qa_calc",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.conv_fin_qa_calc_scenario.ConvFinQACalcScenario",
+            args={},
+        ),
+        adapter_spec=get_generation_adapter_spec(
+            instructions=prompt,
+            input_noun=None,
+            output_noun="Answer",
+        ),
+        metric_specs=[calc_metric] + get_basic_metric_specs([]),
+        groups=["conv_fin_qa_calc"],
+    )
+# Legal
+@run_spec_function("legal_contract_summarization")
+def get_legal_contract_summarization_spec() -> RunSpec:
+    """Create the run spec for the legal contract summarization scenario.
+
+    Returns:
+        A ``RunSpec`` named ``legal_contract_summarization`` that generates at most
+        100 tokens, stops on a blank line, and requests the ``rouge_1``, ``rouge_2``
+        and ``rouge_l`` basic metrics.
+    """
+    rouge_names = ["rouge_1", "rouge_2", "rouge_l"]
+    summarization_adapter = get_generation_adapter_spec(
+        instructions="Summarize the legal document in plain English.",
+        input_noun="Document",
+        output_noun="Summary",
+        max_tokens=100,
+        stop_sequences=["\n\n"],
+    )
+    contract_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
+        args={},
+    )
+    return RunSpec(
+        name="legal_contract_summarization",
+        scenario_spec=contract_scenario,
+        adapter_spec=summarization_adapter,
+        metric_specs=get_basic_metric_specs(rouge_names),
+        groups=["legal_contract_summarization"],
+    )
+@run_spec_function("legal_opinion_sentiment_classification")
+def get_legal_opinion_sentiment_classification_spec() -> RunSpec:
+    """Create the run spec for the legal opinion sentiment classification scenario.
+
+    Returns:
+        A ``RunSpec`` scored with exact-match metrics plus weighted classification
+        metrics over the labels ``positive``, ``neutral`` and ``negative``.
+    """
+    sentiment_labels = ["positive", "neutral", "negative"]
+    opinion_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_opinion_sentiment_classification_scenario.LegalOpinionSentimentClassificationScenario",  # noqa: E501
+    )
+    label_adapter = get_generation_adapter_spec(
+        instructions="Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.",  # noqa: E501
+        output_noun="Label",
+    )
+    all_metrics = get_exact_match_metric_specs() + _get_weighted_classification_metric_specs(
+        labels=sentiment_labels
+    )
+    return RunSpec(
+        name="legal_opinion_sentiment_classification",
+        scenario_spec=opinion_scenario,
+        adapter_spec=label_adapter,
+        metric_specs=all_metrics,
+        groups=["legal_opinion_sentiment_classification"],
+    )
+@run_spec_function("casehold")
+def get_casehold_spec() -> RunSpec:
+    """Create the run spec for the CaseHOLD scenario.
+
+    Returns:
+        A ``RunSpec`` named ``casehold`` that uses the joint multiple-choice adapter
+        with up to 2 training instances and is scored with exact-match metrics.
+    """
+    casehold_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.casehold_scenario.CaseHOLDScenario",
+        args={},
+    )
+    choice_adapter = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Give a letter answer among A, B, C, D, or E.",
+        input_noun="Passage",
+        output_noun="Answer",
+        max_train_instances=2,
+    )
+    return RunSpec(
+        name="casehold",
+        scenario_spec=casehold_scenario,
+        adapter_spec=choice_adapter,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["casehold"],
+    )
+@run_spec_function("echr_judgment_classification")
+def get_echr_judgment_classification_spec() -> RunSpec:
+    """Create the run spec for ECHR judgment classification.
+
+    This is a different implementation (binary classification) of
+    lex_glue_fixed:subset=ecthr_a.
+
+    Returns:
+        A ``RunSpec`` whose prompt pieces come from ``EchrJudgeScenario`` class
+        attributes, limited to a single output token, and scored with exact-match
+        metrics plus weighted classification metrics over ``yes`` and ``no``.
+    """
+    # Imported at function scope rather than at module level.
+    from helm.benchmark.scenarios.echr_judgment_classification_scenario import EchrJudgeScenario
+    judgment_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.echr_judgment_classification_scenario.EchrJudgeScenario",
+        args={"filter_max_length": 600},
+    )
+    judgment_adapter = get_generation_adapter_spec(
+        instructions=EchrJudgeScenario.PROMPT_INST_WITH_EX,
+        input_noun=EchrJudgeScenario.PROMPT_INPUT,
+        output_noun=EchrJudgeScenario.PROMPT_OUTPUT,
+        max_tokens=1,
+    )
+    yes_no_metrics = _get_weighted_classification_metric_specs(labels=["yes", "no"])
+    return RunSpec(
+        name="echr_judgment_classification",
+        scenario_spec=judgment_scenario,
+        adapter_spec=judgment_adapter,
+        metric_specs=get_exact_match_metric_specs() + yes_no_metrics,
+        groups=["echr_judgment_classification"],
+    )
+# Climate
+@run_spec_function("sumosum")
+def get_sumosum_spec() -> RunSpec:
+    """Create the run spec for the SUMO title-generation scenario.
+
+    Returns:
+        A zero-shot ``RunSpec`` named ``sumosum`` that generates at most 100 tokens,
+        stops on a blank line, and requests the ``rouge_1``, ``rouge_2`` and
+        ``rouge_l`` basic metrics.
+    """
+    length_filters = {
+        # Articles that are too short could be garbage.
+        "test_filter_min_length": 100,
+        # Articles that are too long do not fit in a prompt.
+        "test_filter_max_length": 3700,
+    }
+    title_adapter = get_generation_adapter_spec(
+        instructions="Generate the title of the following article.",
+        output_noun="Title",
+        max_train_instances=0,
+        max_tokens=100,
+        stop_sequences=["\n\n"],
+    )
+    return RunSpec(
+        name="sumosum",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.sumosum_scenario.SUMOSumScenario",
+            args=length_filters,
+        ),
+        adapter_spec=title_adapter,
+        metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]),
+        groups=["sumosum"],
+    )
+# Cyber Security
+@run_spec_function("cti_to_mitre")
+def get_cti_to_mitre_spec(num_options: int = 10, seed: int = 42, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    """Create the run spec for the CTI-to-MITRE multiple-choice scenario.
+
+    Args:
+        num_options: Forwarded to the scenario as its ``num_options`` argument.
+        seed: Forwarded to the scenario as its ``seed`` argument.
+        method: Adaptation method handed to the multiple-choice adapter spec.
+
+    Returns:
+        A ``RunSpec`` whose name embeds all three arguments, using up to 10 training
+        instances and scored with exact-match metrics.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.cti_to_mitre_scenario.CtiToMitreScenario",
+        args={
+            "num_options": num_options,
+            "seed": seed,
+        },
+    )
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="Classify the following situation by the type of security attack. Answer with only a single letter.",  # noqa: E501
+        input_noun="Situation",
+        output_noun="Answer",
+        max_train_instances=10,
+    )
+    return RunSpec(
+        name=f"cti_to_mitre:num_options={num_options},seed={seed},method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["cti_to_mitre"],
+    )

helm/benchmark/run_specs/experimental_run_specs.py CHANGED Viewed

@@ -2,10 +2,12 @@
 These run specs are not intended for use with public leaderboards."""
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
-from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
-from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -83,3 +85,110 @@ Which context makes more sense given the scenario? Please answer using either "1
         metric_specs=get_exact_match_metric_specs(),
         groups=["ewok", f"ewok_{domain}"],
     )
+@run_spec_function("autobencher_capabilities")
+def get_autobencher_capabilities_spec(subject: str) -> RunSpec:
+    """Create the run spec for the AutoBencher capabilities scenario.
+
+    Args:
+        subject: Forwarded to the scenario as its ``subject`` argument and embedded
+            in the run spec name.
+
+    Returns:
+        A ``RunSpec`` annotated by ``AutoBencherCapabilitiesAnnotator`` and scored with
+        exact-match metrics plus an ``AnnotationNumericMetric`` reading the ``score``
+        key of the ``autobencher_capabilities`` annotator.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.autobencher_capabilities_scenario.AutoBencherCapabilitiesScenario",
+        args={"subject": subject},
+    )
+    adapter_spec = get_generation_adapter_spec(
+        instructions=("Output just with the final answer to the question."),
+        input_noun="Question",
+        output_noun="Answer",
+        max_tokens=100,
+    )
+    annotator_specs = [
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.autobencher_capabilities_annotator.AutoBencherCapabilitiesAnnotator"
+        )
+    ]
+    annotator_metric_spec = MetricSpec(
+        class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+        args={
+            "annotator_name": "autobencher_capabilities",
+            "key": "score",
+        },
+    )
+    return RunSpec(
+        # Embed the subject so that runs over different subjects get distinct names
+        # instead of all sharing "autobencher_capabilities".
+        name=f"autobencher_capabilities:subject={subject}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs() + [annotator_metric_spec],
+        groups=["autobencher_capabilities"],
+    )
+@run_spec_function("autobencher_safety")
+def get_autobencher_safety_spec() -> RunSpec:
+    """Create the run spec for the AutoBencher safety scenario.
+
+    Returns:
+        A zero-shot ``RunSpec`` named ``autobencher_safety`` whose adapter leaves the
+        instructions and every prefix and suffix it sets empty. It is annotated by
+        ``AutoBencherSafetyAnnotator`` and scored with exact-match metrics plus an
+        ``AnnotationNumericMetric`` reading the ``score`` key of the
+        ``autobencher_safety`` annotator.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.autobencher_safety_scenario.AutoBencherSafetyScenario",
+    )
+    # Single assignment target; the duplicated ``adapter_spec = adapter_spec = ...`` was redundant.
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.autobencher_safety_annotator.AutoBencherSafetyAnnotator")
+    ]
+    annotator_metric_spec = MetricSpec(
+        class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+        args={
+            "annotator_name": "autobencher_safety",
+            "key": "score",
+        },
+    )
+    return RunSpec(
+        name="autobencher_safety",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs() + [annotator_metric_spec],
+        groups=["autobencher_safety"],
+    )
+@run_spec_function("czech_bank_qa")
+def get_czech_bank_qa_spec(config_name: str = "berka_queries_1024_2024_12_18") -> RunSpec:
+    """Create the run spec for the Czech Bank QA scenario.
+
+    Args:
+        config_name: Forwarded to the scenario as its ``config_name`` argument.
+
+    Returns:
+        A ``RunSpec`` named ``czech_bank_qa`` that generates at most 512 tokens, stops
+        on a blank line, is annotated by ``CzechBankQAAnnotator``, and is scored with
+        the basic metric specs (empty name list) followed by ``CzechBankQAMetrics``.
+    """
+    # Imported at function scope rather than at module level.
+    from helm.benchmark.scenarios.czech_bank_qa_scenario import CzechBankQAScenario
+    bank_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
+        args={"config_name": config_name},
+    )
+    sql_adapter = get_generation_adapter_spec(
+        instructions=CzechBankQAScenario.INSTRUCTIONS,
+        input_noun="Instruction",
+        output_noun="SQL Query",
+        max_tokens=512,
+        stop_sequences=["\n\n"],
+    )
+    bank_metric = MetricSpec(class_name="helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics", args={})
+    bank_annotator = AnnotatorSpec("helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator")
+    # NOTE(review): the name does not include config_name, so runs with different
+    # configs share one name - confirm whether that is intended.
+    return RunSpec(
+        name="czech_bank_qa",
+        scenario_spec=bank_scenario,
+        adapter_spec=sql_adapter,
+        metric_specs=get_basic_metric_specs([]) + [bank_metric],
+        annotators=[bank_annotator],
+        groups=["czech_bank_qa"],
+    )

helm/benchmark/run_specs/finance_run_specs.py CHANGED Viewed

@@ -89,10 +89,14 @@ def get_banking77_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.banking77_scenario.Banking77Scenario", args={})
-    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77
+    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77,
+    # with a slight modification to the instruction prompt for instruction-following models.
     scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), Banking77Scenario.name)
+    instructions = get_raft_instructions("banking_77", scenario_cache_path).replace(
+        "\n", " Answer with only the label for the last query.\n", 1
+    )
     adapter_spec = get_generation_adapter_spec(
-        instructions=get_raft_instructions("banking_77", scenario_cache_path),
+        instructions=instructions,
         input_noun=None,
         output_noun="Label",
         max_tokens=30,  # at most ~50 characters per label

helm/benchmark/run_specs/imdb_ptbr_run_specs.py ADDED Viewed

@@ -0,0 +1,30 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+@run_spec_function("imdb_ptbr")
+def get_tweetsentbr_spec() -> RunSpec:
+    """Create the run spec for the IMDB PT-BR review classification scenario.
+
+    Returns:
+        A ``RunSpec`` named ``imdb_ptbr`` scored with exact-match metrics plus the
+        default classification metric specs.
+    """
+    # NOTE(review): the function name mentions tweetsentbr although it registers and
+    # builds the imdb_ptbr run spec - looks like a copy-paste leftover; confirm before renaming.
+    # NOTE(review): the continuation lines of the prompt below carry their source
+    # indentation into the prompt text - confirm whether that is intended.
+    review_prompt = """Classifique a resenha do usuário sobre o filme como "positivo" ou "negativo".
+        Resenha: Tudo sobre o filme é maravilhoso. Atuações, trilha sonora, fotografia. Amei tudo!
+        Classe: positivo
+        Resenha: Achei um filme bem fraco, não gostei da história.
+        Classe: negativo
+        """
+    review_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.imdb_ptbr_scenario.IMDB_PTBRScenario",
+        args={},
+    )
+    review_adapter = get_generation_adapter_spec(
+        instructions=review_prompt,
+        input_noun="Resenha",
+        output_noun="Classe",
+    )
+    return RunSpec(
+        name="imdb_ptbr",
+        scenario_spec=review_scenario,
+        adapter_spec=review_adapter,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["imdb_ptbr"],
+    )

helm/benchmark/run_specs/lite_run_specs.py CHANGED Viewed

@@ -166,8 +166,8 @@ def get_math_spec(
     use_chain_of_thought: str = "False",
 ) -> RunSpec:
     # Convert to bools and remove the str versions
-    use_official_examples_bool: bool = use_official_examples == "True"
-    use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
+    use_official_examples_bool: bool = use_official_examples.lower() == "true"
+    use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
     del use_official_examples
     del use_chain_of_thought

helm/benchmark/run_specs/long_context_run_specs.py ADDED Viewed

@@ -0,0 +1,89 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_open_ended_generation_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
+    """Build the generation adapter spec shared by the long-context run specs.
+
+    Args:
+        max_tokens: Passed through as the adapter's ``max_tokens``.
+
+    Returns:
+        A zero-shot generation ``AdapterSpec`` with temperature 0.0, one output, no
+        stop sequences, ``"A. "`` as the reference prefix, and empty instructions
+        and remaining prefixes and suffixes.
+    """
+    # Every text field listed here is set to the empty string.
+    blank_text_fields = dict.fromkeys(
+        (
+            "global_prefix",
+            "global_suffix",
+            "instructions",
+            "input_prefix",
+            "input_suffix",
+            "reference_suffix",
+            "output_prefix",
+            "output_suffix",
+            "instance_prefix",
+        ),
+        "",
+    )
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        reference_prefix="A. ",
+        max_train_instances=0,
+        num_outputs=1,
+        temperature=0.0,
+        max_tokens=max_tokens,
+        stop_sequences=[],
+        **blank_text_fields,
+    )
+@run_spec_function("ruler_hotpotqa")
+def get_ruler_hotpotqa_spec(max_num_words: int = 65536) -> RunSpec:
+    """Create the run spec for the RULER HotpotQA scenario.
+
+    Args:
+        max_num_words: Forwarded to the scenario as its ``max_num_words`` argument
+            and embedded in the run spec name.
+
+    Returns:
+        A ``RunSpec`` that generates at most 100 tokens and is scored with the
+        open-ended generation metric specs.
+    """
+    scenario_args = {"max_num_words": max_num_words}
+    return RunSpec(
+        name=f"ruler_hotpotqa:max_num_words={max_num_words}",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERHotpotQAScenario",
+            args=scenario_args,
+        ),
+        adapter_spec=_get_long_context_generation_adapter_spec(max_tokens=100),
+        metric_specs=get_open_ended_generation_metric_specs(),
+        groups=["ruler_hotpotqa"],
+    )
+@run_spec_function("ruler_squad")
+def get_ruler_squad_spec(max_num_words: int = 65536) -> RunSpec:
+    """Create the run spec for the RULER SQuAD scenario.
+
+    Args:
+        max_num_words: Forwarded to the scenario as its ``max_num_words`` argument
+            and embedded in the run spec name.
+
+    Returns:
+        A ``RunSpec`` that generates at most 100 tokens and is scored with the
+        open-ended generation metric specs.
+    """
+    scenario_args = {"max_num_words": max_num_words}
+    return RunSpec(
+        name=f"ruler_squad:max_num_words={max_num_words}",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERSQuADScenario",
+            args=scenario_args,
+        ),
+        adapter_spec=_get_long_context_generation_adapter_spec(max_tokens=100),
+        metric_specs=get_open_ended_generation_metric_specs(),
+        groups=["ruler_squad"],
+    )
+@run_spec_function("infinite_bench_sum")
+def get_infinite_bench_sum_spec(min_num_words: int = 0, max_num_words: int = 65536) -> RunSpec:
+    """Create the run spec for the InfiniteBench summarization scenario.
+
+    Args:
+        min_num_words: Forwarded to the scenario as its ``min_num_words`` argument.
+        max_num_words: Forwarded to the scenario as its ``max_num_words`` argument.
+
+    Returns:
+        A ``RunSpec`` whose name embeds both arguments, generating at most 2000
+        tokens and requesting the ``rouge_l`` basic metric.
+    """
+    word_limits = {
+        "min_num_words": min_num_words,
+        "max_num_words": max_num_words,
+    }
+    summary_scenario = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.infinite_bench_sum_scenario.InfiniteBenchSumScenario",
+        args=word_limits,
+    )
+    # There is no official max-token figure; the paper reports an average output of about 1.1k tokens.
+    summary_adapter = _get_long_context_generation_adapter_spec(max_tokens=2000)
+    return RunSpec(
+        name=f"infinite_bench_sum:min_num_words={min_num_words},max_num_words={max_num_words}",
+        scenario_spec=summary_scenario,
+        adapter_spec=summary_adapter,
+        metric_specs=get_basic_metric_specs(["rouge_l"]),
+        groups=["infinite_bench_sum"],
+    )

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl