crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
@@ -0,0 +1,49 @@
+"""Run spec functions for three clinical sections of MMLU human-translated into 11 African languages
+
+Available subjects: "clinical_knowledge", "college_medicine", "virology"
+Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts" (see lang_map below for language code mapping to language name, or here for ISO code reference: https://huggingface.co/languages)
+"""  # noqa: E501
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("mmlu_clinical_afr")
+def get_mmlu_clinical_afr_spec(subject: str, lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mmlu_clinical_afr_scenario.MMLU_Clinical_Afr_Scenario",
+        args={"subject": subject, "lang": lang},
+    )
+
+    lang_map = {
+        "af": "Afrikaans",
+        "zu": "Zulu",
+        "xh": "Xhosa",
+        "am": "Amharic",
+        "bm": "Bambara",
+        "ig": "Igbo",
+        "nso": "Sepedi",
+        "sn": "Shona",
+        "st": "Sesotho",
+        "tn": "Setswana",
+        "ts": "Tsonga",
+    }
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')} "
+        f"in {lang_map[lang]}.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"mmlu_clinical_afr:subject={subject},lang={lang},method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=[f"mmlu_clinical_afr_{subject}", f"mmlu_clinical_afr_{subject}_{lang}"],
+    )
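For context, the new mmlu_clinical_afr run spec above can be exercised directly from Python. The following is an illustrative sketch, not part of the diff: it assumes crfm-helm 0.5.5 is installed, and the subject and language values are arbitrary examples.

# Illustrative sketch (not part of the diff): build the new run spec and inspect it.
# Assumes crfm-helm 0.5.5 is installed; "college_medicine" and "zu" are example values.
from helm.benchmark.run_specs.mmlu_clinical_afr_run_specs import get_mmlu_clinical_afr_spec

run_spec = get_mmlu_clinical_afr_spec(subject="college_medicine", lang="zu")
print(run_spec.name)    # e.g. mmlu_clinical_afr:subject=college_medicine,lang=zu,method=multiple_choice_joint
print(run_spec.groups)  # ['mmlu_clinical_afr_college_medicine', 'mmlu_clinical_afr_college_medicine_zu']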
@@ -0,0 +1,32 @@
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("oab_exams")
+def get_enem_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.oab_exams_scenario.OABExamsScenario", args={})
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions="Dê uma resposta selecionando uma letra entre as opções fornecidas. "
+        "Se as opções forem A, B, C e D,"
+        "sua resposta deve consistir em uma única letra que corresponde a resposta correta.\n"
+        "Exemplo: Ao conselho da subseção compete\nA. representar a OAB no Conselho de Segurança do MERCOSUL."
+        "\nB. fiscalizar as funções e atribuições do conselho seccional.\nC. instaurar e instruir processos "
+        "disciplinares, para julgamento pelo Conselho Federal.\nD. receber pedido de inscrição nos quadros de "
+        "advogado e estagiário, instruindo e emitindo parecer prévio, para decisão do conselho seccional.\n"
+        "Resposta: D",
+        input_noun="Pergunta",
+        output_noun="Resposta",
+    )
+
+    return RunSpec(
+        name="oab_exams",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["oab_exams"],
+    )
@@ -41,6 +41,43 @@ def get_harm_bench_spec() -> RunSpec:
     )


+@run_spec_function("harm_bench_gcg_transfer")
+def get_harm_bench_gcg_transfer_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario.HarmBenchGCGTransferScenario"
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="harm_bench_gcg_transfer",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["harm_bench_gcg_transfer"],
+    )
+
+
 @run_spec_function("simple_safety_tests")
 def get_simple_safety_tests_spec() -> RunSpec:
     adapter_spec = AdapterSpec(
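The harm_bench_gcg_transfer hunk above appears to belong to helm/benchmark/run_specs/safety_run_specs.py (the +37/-0 entry in the file list), although the diff viewer does not repeat the file path here. As a hedged illustration only, the new run spec takes no arguments, so it can be built and inspected directly; the import path below is inferred from that file list entry.

# Hedged illustration (not part of the diff): construct the new zero-argument run spec.
# The module path is inferred from the safety_run_specs.py entry in the file list above.
from helm.benchmark.run_specs.safety_run_specs import get_harm_bench_gcg_transfer_spec

spec = get_harm_bench_gcg_transfer_spec()
print(spec.name)                                 # harm_bench_gcg_transfer
print([a.class_name for a in spec.annotators])   # the HarmBenchAnnotator used for model-as-judge scoring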
@@ -5,9 +5,9 @@ from helm.benchmark.adaptation.common_adapter_specs import (
     get_generation_adapter_spec,
     get_multiple_choice_separate_adapter_spec,
 )
-from helm.benchmark.metrics.bhasa_metrics_specs import (
-    get_bhasa_machine_translation_metric_specs,
-    get_bhasa_qa_metric_specs,
+from helm.benchmark.metrics.seahelm_metrics_specs import (
+    get_seahelm_machine_translation_metric_specs,
+    get_seahelm_qa_metric_specs,
 )
 from helm.benchmark.metrics.common_metric_specs import (
     get_basic_metric_specs,
@@ -17,7 +17,7 @@ from helm.benchmark.metrics.common_metric_specs import (
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec

-# BHASA Run Specs
+# SEA-HELM Run Specs
 # A. Natural Language Understanding
 # B. Natural Language Generation
 # C. Natural Language Reasoning
@@ -43,18 +43,18 @@ def get_tydiqa_spec() -> RunSpec:
         max_tokens=256,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.TyDiQAScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.TyDiQAScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_qa_metric_specs(
+        metric_specs=get_seahelm_qa_metric_specs(
             args={
                 "language": "id",
             }
         ),
-        groups=["bhasa_nlu", "tydiqa"],
+        groups=["seahelm_nlu", "tydiqa"],
     )


@@ -84,7 +84,7 @@ def get_xquad_spec(language="th") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.XQuADScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.XQuADScenario",
         args={
             "language": language,
         },
@@ -94,12 +94,12 @@ def get_xquad_spec(language="th") -> RunSpec:
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_qa_metric_specs(
+        metric_specs=get_seahelm_qa_metric_specs(
             args={
                 "language": language,
             }
         ),
-        groups=["bhasa_nlu", f"xquad_{language}"],
+        groups=["seahelm_nlu", f"xquad_{language}"],
     )


@@ -116,18 +116,18 @@ def get_indicqa_spec() -> RunSpec:
         max_tokens=256,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicQAScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicQAScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_qa_metric_specs(
+        metric_specs=get_seahelm_qa_metric_specs(
             args={
                 "language": "ta",
             }
         ),
-        groups=["bhasa_nlu", "indicqa"],
+        groups=["seahelm_nlu", "indicqa"],
     )


@@ -146,14 +146,14 @@ def get_nusax_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.NusaXScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.NusaXScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "nusax"],
+        groups=["seahelm_nlu", "nusax"],
     )


@@ -171,14 +171,14 @@ def get_uitvsfc_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.UITVSFCScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.UITVSFCScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "uitvsfc"],
+        groups=["seahelm_nlu", "uitvsfc"],
     )


@@ -196,14 +196,14 @@ def get_wisesight_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.WisesightScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.WisesightScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "wisesight"],
+        groups=["seahelm_nlu", "wisesight"],
     )


@@ -221,14 +221,14 @@ def get_indicsentiment_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicSentimentScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicSentimentScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_classification_metric_specs() + get_basic_metric_specs([]),
-        groups=["bhasa_nlu", "indicsentiment"],
+        groups=["seahelm_nlu", "indicsentiment"],
     )


@@ -250,14 +250,14 @@ def get_mlhsd_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.MLHSDScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.MLHSDScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "mlhsd"],
+        groups=["seahelm_nlu", "mlhsd"],
     )


@@ -278,14 +278,14 @@ def get_vihsd_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ViHSDScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.ViHSDScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "vihsd"],
+        groups=["seahelm_nlu", "vihsd"],
     )


@@ -304,14 +304,14 @@ def get_thaitoxicitytweets_spec() -> RunSpec:
         max_tokens=16,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.ThaiToxicityTweetsScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.ThaiToxicityTweetsScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlu", "thaitoxicitytweets"],
+        groups=["seahelm_nlu", "thaitoxicitytweets"],
     )


@@ -378,7 +378,7 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.FloresScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.FloresScenario",
         args={
             "pair": pair,
         },
@@ -388,8 +388,8 @@ def get_flores_spec(source="en", target="id") -> RunSpec:
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_bhasa_machine_translation_metric_specs(),
-        groups=["bhasa_nlg", f"flores_{pair}"],
+        metric_specs=get_seahelm_machine_translation_metric_specs(),
+        groups=["seahelm_nlg", f"flores_{pair}"],
     )


@@ -414,14 +414,14 @@ def get_indonli_spec() -> RunSpec:
         max_tokens=2,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndoNLIScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndoNLIScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", "indonli"],
+        groups=["seahelm_nlr", "indonli"],
     )


@@ -456,7 +456,7 @@ def get_xnli_spec(language="vi") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.XNLIScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.XNLIScenario",
         args={
             "language": language,
         },
@@ -467,7 +467,7 @@ def get_xnli_spec(language="vi") -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", f"xnli_{language}"],
+        groups=["seahelm_nlr", f"xnli_{language}"],
     )


@@ -487,14 +487,14 @@ def get_indicxnli_spec() -> RunSpec:
         max_tokens=2,
     )

-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bhasa_scenario.IndicXNLIScenario")
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.seahelm_scenario.IndicXNLIScenario")

     return RunSpec(
         name=name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", "indicxnli"],
+        groups=["seahelm_nlr", "indicxnli"],
     )


@@ -531,7 +531,7 @@ def get_xcopa_spec(language="id") -> RunSpec:
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.XCOPAScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.XCOPAScenario",
         args={
             "language": language,
         },
@@ -542,7 +542,7 @@ def get_xcopa_spec(language="id") -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
-        groups=["bhasa_nlr", f"xcopa_{language}"],
+        groups=["seahelm_nlr", f"xcopa_{language}"],
     )


@@ -566,7 +566,7 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEASyntaxMinimalPairsScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEASyntaxMinimalPairsScenario",
         args={
             "method": method,
             "language": language,
@@ -578,14 +578,18 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_syntax_minimal_pairs_{language}"],
+        groups=[
+            "seahelm_linguistic",
+            f"lindsea_syntax_minimal_pairs_{language}",
+            f"lindsea_syntax_minimal_pairs_{method}_{language}",
+        ],
     )


-# 2.1. Pragmatics: LINDSEA Pragmatic Reasoning (single sentence)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_single")
-def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"
+# 2.1. Pragmatics: LINDSEA Presuppositions
+@run_spec_function("lindsea_pragmatics_presuppositions")
+def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_presuppositions_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -595,9 +599,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningSingleScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEAPragmaticsPresuppositionsScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -606,14 +611,18 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"],
+        groups=[
+            "seahelm_linguistic",
+            f"lindsea_pragmatics_presuppositions_{language}",
+            f"lindsea_pragmatics_presuppositions_{subset}_{language}",
+        ],
     )


-# 2.2. Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_pair")
-def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"
+# 2.2. Pragmatics: LINDSEA Scalar Implicatures
+@run_spec_function("lindsea_pragmatics_scalar_implicatures")
+def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -623,9 +632,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningPairScenario",
+        class_name="helm.benchmark.scenarios.seahelm_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -634,5 +644,9 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"],
+        groups=[
+            "seahelm_linguistic",
+            f"lindsea_pragmatics_scalar_implicatures_{language}",
+            f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}",
+        ],
     )
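For illustration, a minimal sketch of calling one of the renamed LINDSEA pragmatics run-spec functions with the new subset argument; the import path below is an assumption, since this hunk does not show the file name.

# Sketch only: construct the presuppositions spec and inspect the subset-aware name and groups.
from helm.benchmark.run_specs.seahelm_run_specs import (  # assumed module path
    get_lindsea_pragmatics_presuppositions_spec,
)

spec = get_lindsea_pragmatics_presuppositions_spec(language="id", subset="all")
print(spec.name)    # lindsea_pragmatics_presuppositions_all_id
print(spec.groups)  # includes "seahelm_linguistic" plus the language- and subset-specific groups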
@@ -0,0 +1,54 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("bird_sql")
+def get_bird_sql_dev_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None,
+        output_noun=None,
+        max_tokens=1024,
+        stop_sequences=[],
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator")]
+
+    return RunSpec(
+        name="bird_sql",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs()
+        + [MetricSpec(class_name="helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric")],
+        groups=["bird_sql"],
+    )
+
+
+@run_spec_function("spider")
+def get_spider_run_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.spider_scenario.SpiderScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        input_noun=None,
+        output_noun=None,
+        max_tokens=1024,
+        stop_sequences=[],
+    )
+
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.spider_annotator.SpiderAnnotator")]
+
+    return RunSpec(
+        name="spider",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs()
+        + [MetricSpec(class_name="helm.benchmark.metrics.spider_metrics.SpiderMetric")],
+        groups=["spider"],
+    )
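For illustration, a minimal sketch of building the two new text-to-SQL run specs directly and listing the annotators they attach; the import path below is an assumption, since the diff does not show the new file's name.

# Sketch only: construct both specs and print their names, groups, and annotator classes.
from helm.benchmark.run_specs.sql_run_specs import (  # assumed module path
    get_bird_sql_dev_run_spec,
    get_spider_run_spec,
)

for spec in (get_bird_sql_dev_run_spec(), get_spider_run_spec()):
    annotators = [a.class_name for a in (spec.annotators or [])]
    print(spec.name, spec.groups, annotators)
# bird_sql ['bird_sql'] ['helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator']
# spider ['spider'] ['helm.benchmark.annotation.spider_annotator.SpiderAnnotator']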
@@ -0,0 +1,32 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("tweetsentbr")
+def get_tweetsentbr_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario", args={}
+    )
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="""Classifique o tweet como "Positivo", "Neutro" ou "Negativo".
+
+Tweet: vocês viram a novela hoje?
+Classe: Neutro
+
+Tweet: que vontade de comer pizza
+Classe: Neutro
+""",
+        input_noun="Tweet",
+        output_noun="Classe",
+    )
+
+    return RunSpec(
+        name="tweetsentbr",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["tweetsentbr"],
+    )
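For illustration, a rough sketch of the few-shot prompt shape that the instructions, input_noun, and output_noun above would produce for a single instance; this is an approximation, not output captured from HELM.

# Sketch only: approximate prompt layout for one evaluation instance.
prompt = (
    'Classifique o tweet como "Positivo", "Neutro" ou "Negativo".\n'
    "\n"
    "Tweet: vocês viram a novela hoje?\n"
    "Classe: Neutro\n"
    "\n"
    "Tweet: que vontade de comer pizza\n"
    "Classe: Neutro\n"
    "\n"
    "Tweet: <tweet under evaluation>\n"  # placeholder filled in by the adapter
    "Classe:"
)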
@@ -1,3 +1,5 @@
+import os
+
 from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs
@@ -8,9 +10,15 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec
 @run_spec_function("unitxt")
 def get_unitxt_spec(**kwargs) -> RunSpec:
     card = kwargs.get("card")
-    if not card:
-        raise Exception("Unitxt card must be specified")
-    name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()])
+    recipe = kwargs.get("recipe")
+    if not card and not recipe:
+        raise Exception("Unitxt card or recipe must be specified")
+    if os.environ.get("HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES", "").lower() == "true":
+        name_suffix = ",".join(
+            [f"{key}={value}" for key, value in kwargs.items() if key not in ["template_card_index", "loader_limit"]]
+        )
+    else:
+        name_suffix = ",".join([f"{key}={value}" for key, value in kwargs.items()])
     name = f"unitxt:{name_suffix}"
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.unitxt_scenario.UnitxtScenario", args=kwargs)
     adapter_spec = AdapterSpec(
@@ -28,7 +36,8 @@ def get_unitxt_spec(**kwargs) -> RunSpec:
         max_train_instances=0,
         num_outputs=1,
         temperature=0.0,
-        stop_sequences=["\n\n"],
+        max_tokens=512,
+        stop_sequences=[],
     )
     return RunSpec(
         name=name,
@@ -38,5 +47,5 @@ def get_unitxt_spec(**kwargs) -> RunSpec:
             MetricSpec(class_name="helm.benchmark.metrics.unitxt_metrics.UnitxtMetric", args=kwargs),
         ]
         + get_basic_metric_specs([]),
-        groups=[f"unitxt_{card}"],
+        groups=[f"unitxt_{card or recipe}"],
     )
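For illustration, the name-shortening toggle reproduced standalone so its effect on the generated run spec name is easy to see; the card value "cards.example" is a placeholder, not taken from the diff.

# Sketch only: how HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES changes the run spec name.
import os

kwargs = {"card": "cards.example", "template_card_index": 0, "loader_limit": 100}

if os.environ.get("HELM_UNITXT_SHORTEN_RUN_SPEC_NAMES", "").lower() == "true":
    name_suffix = ",".join(
        f"{key}={value}" for key, value in kwargs.items() if key not in ["template_card_index", "loader_limit"]
    )
else:
    name_suffix = ",".join(f"{key}={value}" for key, value in kwargs.items())

print(f"unitxt:{name_suffix}")
# env var set to "true":  unitxt:card=cards.example
# otherwise:              unitxt:card=cards.example,template_card_index=0,loader_limit=100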