PyPI - crfm-helm - Versions diffs - 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend - Supply Chain Defender

crfm-helm 0.5.2py3-none-any.whl → 0.5.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (209) hide show

helm/benchmark/run_expander.py CHANGED Viewed

@@ -10,6 +10,7 @@ from helm.benchmark.model_metadata_registry import (
     get_all_text_models,
     get_model_metadata,
     get_model_names_with_tag,
+    DEPRECATED_MODEL_TAG,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
@@ -343,16 +344,6 @@ class AnthropicClaude3RunExpander(RunExpander):
             run_spec,
             adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
         )
-        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
-            instructions = "Answer with only a single letter."
-            if run_spec.adapter_spec.instructions:
-                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
-            return [
-                replace(
-                    run_spec,
-                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
-                ),
-            ]
         return [run_spec]
@@ -610,6 +601,12 @@ class ModelRunExpander(ReplaceValueRunExpander):
                 values_dict["ablation"] = models
             else:
                 values_dict[family_name] = models
+        # For each of the keys above, filter out deprecated models.
+        deprecated_models = set(get_model_names_with_tag(DEPRECATED_MODEL_TAG))
+        for family_name in values_dict.keys():
+            values_dict[family_name] = [model for model in values_dict[family_name] if model not in deprecated_models]
         return values_dict
@@ -1274,6 +1271,30 @@ class IncreaseMaxTokensRunExpander(RunExpander):
         ]
+class TemperatureRunExpander(RunExpander):
+    """
+    Run expander for setting the temperature.
+    """
+    name = "temperature"
+    def __init__(self, value: float):
+        """
+        Args:
+            value (float): The amount to set temperature to
+        """
+        self.value = value
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = replace(run_spec.adapter_spec, temperature=self.value)
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
 class IncreaseTemperatureRunExpander(RunExpander):
     """
     Run expander for increasing the temperature.
@@ -1402,23 +1423,26 @@ class OutputFormatInstructions(RunExpander):
     name = "output_format_instructions"
+    _SUFFIX_SUFFIX = "_suffix"
     def __init__(self, scenario: str):
-        self.scenario = scenario
+        if scenario.endswith(OutputFormatInstructions._SUFFIX_SUFFIX):
+            self.scenario = scenario[: -len(OutputFormatInstructions._SUFFIX_SUFFIX)]
+            self.suffix = True
+        else:
+            self.scenario = scenario
+            self.suffix = False
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
             if self.scenario == "mmlu_only_last_question":
                 instructions = "Answer only the last question with only a single letter."
+            elif self.scenario == "mmlu":
+                instructions = "Answer with only a single letter."
+            elif self.scenario == "mcqa":
+                instructions = "Answer with only a single letter."
             else:
                 instructions = "Answer with only a single letter."
-            if run_spec.adapter_spec.instructions:
-                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
-            return [
-                replace(
-                    run_spec,
-                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
-                ),
-            ]
         elif run_spec.adapter_spec.method == ADAPT_GENERATION:
             output_noun = run_spec.adapter_spec.output_prefix.split(":")[0]
             if self.scenario == "narrative_qa":
@@ -1433,27 +1457,53 @@ class OutputFormatInstructions(RunExpander):
                     instructions = f"Answer with the {output_noun.lower()}."
                 else:
                     instructions = "Answer yes or no."
+            elif self.scenario == "legalbench_abercrombie":
+                instructions = "Answer with only 'generic', 'descriptive', 'suggestive', 'arbitrary' or 'fanciful'."
+            elif self.scenario == "legalbench_function_of_decision_section":
+                instructions = "Answer with only 'Facts', 'Procedural History', 'Issue', 'Rule', 'Analysis', 'Conclusion' or 'Decree'."  # noqa: E501
+            elif self.scenario == "legalbench_yes_or_no":
+                instructions = "Answer with only 'Yes' or 'No'."
             elif self.scenario == "wmt_14":
                 instructions = "Answer with the English translation."
-            else:
-                raise ValueError(f"Unknown scenario {self.scenario}")
-            if run_spec.adapter_spec.output_prefix:
+            elif self.scenario == "wmt_14_only_last_sentence":
+                instructions = "Answer with only the English translation for the last sentence."
+            elif self.scenario == "math":
+                instructions = "Wrap the final answer with the \\boxed{} command."
+            elif self.scenario == "numeric_nlg":
+                instructions = "Answer with only description of the last table as a single paragraph on a single line."
+            elif self.scenario == "tab_fact":
                 instructions = (
-                    f"{instructions} Do not include '{run_spec.adapter_spec.output_prefix.strip()}' in your answer."
+                    "Answer with only the classification of the last statement, either 'refuted' or 'entailed'."
+                )
+            elif self.scenario == "wikitq":
+                instructions = (
+                    "Answer only the last question with a short answer. "
+                    "Avoid extra, unnecessary information in the answer."
                 )
-            if run_spec.adapter_spec.instructions:
-                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
             else:
-                instructions = f"{instructions}\n"
+                raise ValueError(f"Unknown scenario {self.scenario}")
+        if self.suffix:
             return [
                 replace(
                     run_spec,
-                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                    adapter_spec=replace(
+                        run_spec.adapter_spec,
+                        global_suffix=f"{run_spec.adapter_spec.global_suffix}\n\n{instructions}",
+                    ),
                 ),
             ]
-        raise ValueError(f"Unknown scenario {self.scenario}")
+        if run_spec.adapter_spec.instructions:
+            instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+        else:
+            instructions = f"{instructions}\n"
+        return [
+            replace(
+                run_spec,
+                adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+            ),
+        ]
 RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
@@ -1479,6 +1529,9 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     ChatMLRunExpander,
     EvalSplitRunExpander,
     OutputFormatInstructions,
+    TemperatureRunExpander,
+    IncreaseTemperatureRunExpander,
+    IncreaseMaxTokensRunExpander,
 ]