PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (580) hide show

helm/benchmark/run_specs/experimental_run_specs.py CHANGED Viewed

@@ -2,10 +2,12 @@
 These run specs are not intended for use with public leaderboards."""
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_MULTIPLE_CHOICE_JOINT
-from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
-from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec, get_generation_adapter_spec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_exact_match_metric_specs
+from helm.benchmark.metrics.metric import MetricSpec
 from helm.benchmark.run_spec import RunSpec, run_spec_function
 from helm.benchmark.scenarios.scenario import ScenarioSpec
@@ -83,3 +85,110 @@ Which context makes more sense given the scenario? Please answer using either "1
         metric_specs=get_exact_match_metric_specs(),
         groups=["ewok", f"ewok_{domain}"],
     )
+@run_spec_function("autobencher_capabilities")
+def get_autobencher_capabilities_spec(subject: str) -> RunSpec:
+    """Assemble the run spec for the AutoBencher capabilities scenario.
+
+    Args:
+        subject: Passed through unchanged as the scenario's ``subject`` argument.
+
+    Returns:
+        A ``RunSpec`` combining the scenario, a generation adapter created with
+        ``max_tokens=100``, the AutoBencher capabilities annotator, and the
+        exact-match metric specs followed by an ``AnnotationNumericMetric`` spec
+        configured with ``annotator_name="autobencher_capabilities"`` and
+        ``key="score"``.
+    """
+    # NOTE(review): the run name below does not include `subject`; confirm that
+    # runs for different subjects are not expected to be told apart by name.
+    score_metric_spec = MetricSpec(
+        class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+        args={"annotator_name": "autobencher_capabilities", "key": "score"},
+    )
+    return RunSpec(
+        name="autobencher_capabilities",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.autobencher_capabilities_scenario.AutoBencherCapabilitiesScenario",
+            args={"subject": subject},
+        ),
+        adapter_spec=get_generation_adapter_spec(
+            instructions="Output just with the final answer to the question.",
+            input_noun="Question",
+            output_noun="Answer",
+            max_tokens=100,
+        ),
+        annotators=[
+            AnnotatorSpec(
+                class_name="helm.benchmark.annotation.autobencher_capabilities_annotator.AutoBencherCapabilitiesAnnotator"
+            )
+        ],
+        metric_specs=get_exact_match_metric_specs() + [score_metric_spec],
+        groups=["autobencher_capabilities"],
+    )
+@run_spec_function("autobencher_safety")
+def get_autobencher_safety_spec() -> RunSpec:
+    """Build the run spec for the AutoBencher safety scenario.
+
+    The adapter is a bare generation adapter: every prefix, suffix and the
+    instructions are empty strings, no in-context training instances are used
+    (``max_train_instances=0``), and a single completion of at most 512 tokens
+    is requested at temperature 0.0 with no stop sequences.
+
+    Returns:
+        A ``RunSpec`` named ``autobencher_safety`` whose metric specs are the
+        exact-match metric specs followed by an ``AnnotationNumericMetric`` spec
+        configured with ``annotator_name="autobencher_safety"`` and
+        ``key="score"``.
+    """
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.autobencher_safety_scenario.AutoBencherSafetyScenario",
+    )
+    # Single assignment target: the original bound `adapter_spec` twice in a
+    # chained assignment, which was redundant.
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.autobencher_safety_annotator.AutoBencherSafetyAnnotator")
+    ]
+    # Metric spec pointing at the "score" key of the "autobencher_safety" annotator.
+    annotator_metric_spec = MetricSpec(
+        class_name="helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
+        args={
+            "annotator_name": "autobencher_safety",
+            "key": "score",
+        },
+    )
+    return RunSpec(
+        name="autobencher_safety",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=get_exact_match_metric_specs() + [annotator_metric_spec],
+        groups=["autobencher_safety"],
+    )
+@run_spec_function("czech_bank_qa")
+def get_czech_bank_qa_spec(config_name: str = "berka_queries_1024_2024_12_18") -> RunSpec:
+    """Assemble the run spec for the Czech Bank QA scenario.
+
+    Args:
+        config_name: Passed through unchanged as the scenario's ``config_name``
+            argument.
+
+    Returns:
+        A ``RunSpec`` named ``czech_bank_qa`` using a generation adapter
+        ("Instruction" -> "SQL Query", ``max_tokens=512``, stopping on a blank
+        line), the ``CzechBankQAAnnotator``, and the basic metric specs
+        followed by the ``CzechBankQAMetrics`` spec.
+    """
+    # Function-scope import: the scenario class is only needed for its
+    # INSTRUCTIONS constant. NOTE(review): presumably kept local to avoid
+    # importing the scenario module at module import time -- confirm.
+    from helm.benchmark.scenarios.czech_bank_qa_scenario import CzechBankQAScenario
+    sql_adapter_spec = get_generation_adapter_spec(
+        instructions=CzechBankQAScenario.INSTRUCTIONS,
+        input_noun="Instruction",
+        output_noun="SQL Query",
+        max_tokens=512,
+        stop_sequences=["\n\n"],
+    )
+    czech_bank_metric_spec = MetricSpec(
+        class_name="helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics", args={}
+    )
+    czech_bank_annotator_spec = AnnotatorSpec(
+        class_name="helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator"
+    )
+    return RunSpec(
+        name="czech_bank_qa",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
+            args={"config_name": config_name},
+        ),
+        adapter_spec=sql_adapter_spec,
+        metric_specs=get_basic_metric_specs([]) + [czech_bank_metric_spec],
+        annotators=[czech_bank_annotator_spec],
+        groups=["czech_bank_qa"],
+    )

helm/benchmark/run_specs/imdb_ptbr_run_specs.py ADDED Viewed

@@ -0,0 +1,30 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+@run_spec_function("imdb_ptbr")
+def get_tweetsentbr_spec() -> RunSpec:
+    """Assemble the run spec registered under the name ``imdb_ptbr``.
+
+    The adapter instructions are written in Portuguese and ask for a movie
+    review to be classified as "positivo" or "negativo"; they embed one example
+    review for each class.
+
+    NOTE(review): the function is called ``get_tweetsentbr_spec`` although it
+    builds the ``imdb_ptbr`` run spec -- this looks like a copy-paste leftover.
+    The name is left as is because other code may reference it.
+
+    Returns:
+        A ``RunSpec`` named ``imdb_ptbr`` scored with the exact-match metric
+        specs followed by the classification metric specs.
+    """
+    # The literal below is runtime prompt text; its wording and whitespace are
+    # intentionally left untouched.
+    classification_instructions = """Classifique a resenha do usuário sobre o filme como "positivo" ou "negativo".
+        Resenha: Tudo sobre o filme é maravilhoso. Atuações, trilha sonora, fotografia. Amei tudo!
+        Classe: positivo
+        Resenha: Achei um filme bem fraco, não gostei da história.
+        Classe: negativo
+        """
+    return RunSpec(
+        name="imdb_ptbr",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.imdb_ptbr_scenario.IMDB_PTBRScenario", args={}
+        ),
+        adapter_spec=get_generation_adapter_spec(
+            instructions=classification_instructions,
+            input_noun="Resenha",
+            output_noun="Classe",
+        ),
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["imdb_ptbr"],
+    )

helm/benchmark/run_specs/lite_run_specs.py CHANGED Viewed

@@ -166,8 +166,8 @@ def get_math_spec(
     use_chain_of_thought: str = "False",
 ) -> RunSpec:
     # Convert to bools and remove the str versions
-    use_official_examples_bool: bool = use_official_examples == "True"
-    use_chain_of_thought_bool: bool = use_chain_of_thought == "True"
+    use_official_examples_bool: bool = use_official_examples.lower() == "true"
+    use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
     del use_official_examples
     del use_chain_of_thought

helm/benchmark/run_specs/long_context_run_specs.py ADDED Viewed

@@ -0,0 +1,89 @@
+from helm.benchmark.adaptation.adapter_spec import ADAPT_GENERATION, AdapterSpec
+from helm.benchmark.metrics.common_metric_specs import get_basic_metric_specs, get_open_ended_generation_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+def _get_long_context_generation_adapter_spec(max_tokens: int) -> AdapterSpec:
+    """Return the generation ``AdapterSpec`` used by the long-context run specs in this module.
+
+    Args:
+        max_tokens: Value assigned to the adapter's ``max_tokens`` field.
+
+    Returns:
+        An ``AdapterSpec`` using ``ADAPT_GENERATION`` with no training
+        instances, one output, temperature 0.0, no stop sequences, and empty
+        strings for every prompt affix except ``reference_prefix`` ("A. ").
+    """
+    # Every prompt affix other than the reference prefix is the empty string.
+    blank_affixes = {
+        affix_name: ""
+        for affix_name in (
+            "global_prefix",
+            "global_suffix",
+            "instructions",
+            "input_prefix",
+            "input_suffix",
+            "reference_suffix",
+            "output_prefix",
+            "output_suffix",
+            "instance_prefix",
+        )
+    }
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        reference_prefix="A. ",
+        max_train_instances=0,
+        num_outputs=1,
+        temperature=0.0,
+        max_tokens=max_tokens,
+        stop_sequences=[],
+        **blank_affixes,
+    )
+@run_spec_function("ruler_hotpotqa")
+def get_ruler_hotpotqa_spec(max_num_words: int = 65536) -> RunSpec:
+    """Assemble the run spec for the RULER HotpotQA scenario.
+
+    Args:
+        max_num_words: Passed through as the scenario's ``max_num_words``
+            argument and embedded in the run name.
+
+    Returns:
+        A ``RunSpec`` using the long-context generation adapter with
+        ``max_tokens=100`` and the open-ended generation metric specs.
+    """
+    run_name = f"ruler_hotpotqa:max_num_words={max_num_words}"
+    hotpotqa_scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERHotpotQAScenario",
+        args={"max_num_words": max_num_words},
+    )
+    return RunSpec(
+        name=run_name,
+        scenario_spec=hotpotqa_scenario_spec,
+        adapter_spec=_get_long_context_generation_adapter_spec(max_tokens=100),
+        metric_specs=get_open_ended_generation_metric_specs(),
+        groups=["ruler_hotpotqa"],
+    )
+@run_spec_function("ruler_squad")
+def get_ruler_squad_spec(max_num_words: int = 65536) -> RunSpec:
+    """Assemble the run spec for the RULER SQuAD scenario.
+
+    Args:
+        max_num_words: Passed through as the scenario's ``max_num_words``
+            argument and embedded in the run name.
+
+    Returns:
+        A ``RunSpec`` using the long-context generation adapter with
+        ``max_tokens=100`` and the open-ended generation metric specs.
+    """
+    run_name = f"ruler_squad:max_num_words={max_num_words}"
+    squad_scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.ruler_qa_scenarios.RULERSQuADScenario",
+        args={"max_num_words": max_num_words},
+    )
+    return RunSpec(
+        name=run_name,
+        scenario_spec=squad_scenario_spec,
+        adapter_spec=_get_long_context_generation_adapter_spec(max_tokens=100),
+        metric_specs=get_open_ended_generation_metric_specs(),
+        groups=["ruler_squad"],
+    )
+@run_spec_function("infinite_bench_sum")
+def get_infinite_bench_sum_spec(min_num_words: int = 0, max_num_words: int = 65536) -> RunSpec:
+    """Assemble the run spec for the InfiniteBench summarization scenario.
+
+    Args:
+        min_num_words: Passed through as the scenario's ``min_num_words``
+            argument and embedded in the run name.
+        max_num_words: Passed through as the scenario's ``max_num_words``
+            argument and embedded in the run name.
+
+    Returns:
+        A ``RunSpec`` using the long-context generation adapter with
+        ``max_tokens=2000`` and the basic metric specs requested with
+        ``["rouge_l"]``.
+    """
+    word_limits = {
+        "min_num_words": min_num_words,
+        "max_num_words": max_num_words,
+    }
+    # The benchmark gives no official output budget; per the paper the average
+    # output is about 1.1k tokens, hence the 2000-token cap.
+    summarization_adapter_spec = _get_long_context_generation_adapter_spec(max_tokens=2000)
+    return RunSpec(
+        name=f"infinite_bench_sum:min_num_words={min_num_words},max_num_words={max_num_words}",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.infinite_bench_sum_scenario.InfiniteBenchSumScenario",
+            args=word_limits,
+        ),
+        adapter_spec=summarization_adapter_spec,
+        metric_specs=get_basic_metric_specs(["rouge_l"]),
+        groups=["infinite_bench_sum"],
+    )

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl