crfm-helm 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/METADATA +10 -8
  2. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/RECORD +50 -37
  3. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/WHEEL +1 -1
  4. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/entry_points.txt +1 -0
  5. helm/benchmark/__init__.py +2 -0
  6. helm/benchmark/adaptation/adapter_spec.py +3 -0
  7. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -7
  8. helm/benchmark/contamination/__init__.py +0 -0
  9. helm/benchmark/metrics/classification_metrics.py +28 -23
  10. helm/benchmark/metrics/test_classification_metrics.py +44 -9
  11. helm/benchmark/presentation/create_plots.py +617 -0
  12. helm/benchmark/presentation/summarize.py +4 -2
  13. helm/benchmark/presentation/test_create_plots.py +32 -0
  14. helm/benchmark/run.py +23 -1
  15. helm/benchmark/run_expander.py +161 -47
  16. helm/benchmark/run_specs.py +84 -10
  17. helm/benchmark/runner.py +31 -3
  18. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  19. helm/benchmark/scenarios/imdb_listdir.json +50014 -0
  20. helm/benchmark/scenarios/lex_glue_scenario.py +58 -17
  21. helm/benchmark/scenarios/lextreme_scenario.py +37 -25
  22. helm/benchmark/scenarios/opinions_qa_scenario.py +194 -0
  23. helm/benchmark/scenarios/scenario.py +5 -0
  24. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  25. helm/benchmark/static/benchmarking.css +14 -0
  26. helm/benchmark/static/benchmarking.js +43 -0
  27. helm/benchmark/static/index.html +2 -0
  28. helm/benchmark/static/json-urls.js +4 -0
  29. helm/benchmark/static/plot-captions.js +16 -0
  30. helm/benchmark/static/schema.yaml +66 -8
  31. helm/benchmark/window_services/cohere_window_service.py +20 -0
  32. helm/benchmark/window_services/flan_t5_window_service.py +29 -0
  33. helm/benchmark/window_services/huggingface_window_service.py +39 -0
  34. helm/benchmark/window_services/test_flan_t5_window_service.py +12 -0
  35. helm/benchmark/window_services/wider_ai21_window_service.py +13 -0
  36. helm/benchmark/window_services/window_service_factory.py +27 -6
  37. helm/common/general.py +12 -5
  38. helm/proxy/clients/aleph_alpha_client.py +47 -28
  39. helm/proxy/clients/auto_client.py +28 -24
  40. helm/proxy/clients/huggingface_client.py +30 -17
  41. helm/proxy/clients/huggingface_model_registry.py +111 -0
  42. helm/proxy/clients/huggingface_tokenizer.py +23 -7
  43. helm/proxy/clients/openai_client.py +60 -2
  44. helm/proxy/clients/test_huggingface_model_registry.py +57 -0
  45. helm/proxy/clients/together_client.py +17 -2
  46. helm/proxy/clients/yalm_tokenizer/voc_100b.sp +0 -0
  47. helm/proxy/clients/yalm_tokenizer/yalm_tokenizer.py +8 -2
  48. helm/proxy/models.py +82 -2
  49. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/LICENSE +0 -0
  50. {crfm_helm-0.2.1.dist-info → crfm_helm-0.2.2.dist-info}/top_level.txt +0 -0
helm/benchmark/run.py CHANGED
@@ -7,6 +7,7 @@ from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec
+from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config
 from helm.proxy.services.remote_service import create_authentication, add_service_args

 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
@@ -73,6 +74,8 @@ def run_benchmarking(
     suite: str,
     dry_run: bool,
     skip_instances: bool,
+    skip_completed_runs: bool,
+    exit_on_error: bool,
     mongo_uri: str = "",
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
@@ -89,7 +92,7 @@
         for run_spec in run_specs:
             hlog(run_spec)

-    runner = Runner(execution_spec, output_path, suite, skip_instances)
+    runner = Runner(execution_spec, output_path, suite, skip_instances, skip_completed_runs, exit_on_error)
     runner.run_all(run_specs)
     return run_specs

@@ -197,6 +200,12 @@
         default=None,
         help="Fail and exit immediately if a particular RunSpec fails.",
     )
+    parser.add_argument(
+        "--skip-completed-runs",
+        action="store_true",
+        default=None,
+        help="Skip RunSpecs that have completed i.e. output files exists.",
+    )
     parser.add_argument(
         "--priority",
         type=int,
@@ -205,9 +214,20 @@
         "If a value for --priority is not specified, run on everything",
     )
     parser.add_argument("-r", "--run-specs", nargs="*", help="Specifies what to run", default=[])
+    parser.add_argument(
+        "--enable-huggingface-models",
+        nargs="+",
+        default=[],
+        help="Experimental: Enable using AutoModelForCausalLM models from Hugging Face Model Hub. "
+        "Format: namespace/model_name[@revision]",
+    )
     add_run_args(parser)
     args = parser.parse_args()
     validate_args(args)
+
+    for huggingface_model_name in args.enable_huggingface_models:
+        register_huggingface_model_config(huggingface_model_name)
+
     run_entries: List[RunEntry] = []
     if args.conf_paths:
         run_entries.extend(read_run_entries(args.conf_paths).entries)
@@ -242,6 +262,8 @@
         suite=args.suite,
         dry_run=args.dry_run,
         skip_instances=args.skip_instances,
+        skip_completed_runs=args.skip_completed_runs,
+        exit_on_error=args.exit_on_error,
         mongo_uri=args.mongo_uri,
     )

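The new --enable-huggingface-models flag feeds each of its arguments to register_huggingface_model_config before run entries are read. A minimal sketch of that call path, mirroring the loop added to main() above; the model names below are hypothetical placeholders in the namespace/model_name[@revision] format described by the flag's help text, not tested configurations.

# Sketch only: mirrors what helm-run now does for each --enable-huggingface-models argument.
from helm.proxy.clients.huggingface_model_registry import register_huggingface_model_config

enabled_models = ["my-org/my-model", "my-org/my-model@v1.0"]  # hypothetical names
for huggingface_model_name in enabled_models:
    register_huggingface_model_config(huggingface_model_name)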
helm/benchmark/run_expander.py CHANGED
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import replace
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional, Tuple, Type

 from helm.proxy.models import (
     get_all_code_models,
@@ -302,36 +302,58 @@ class ModelRunExpander(ReplaceValueRunExpander):
     """

     name = "model"
-    values_dict = {
-        "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "ai21/j1-jumbo": ["ai21/j1-jumbo"],
-        "openai/curie": ["openai/curie"],
-        "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
-        "all": get_all_models(),
-        "text_code": get_all_text_models() + get_all_code_models(),
-        "text": get_all_text_models(),
-        "code": get_all_code_models(),
-        "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-        "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-        "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-        "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-        "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
-        "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
-        "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
-        "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
-    }

-    # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
-    # which contains the subset of models with the ablation tag.
-    ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
-    ablation_values_dict = {}
-    for family_name, models in values_dict.items():
-        ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
-    for family_name, models in ablation_values_dict.items():
-        if family_name == "ablation_all":
-            values_dict["ablation"] = models
+    def __init__(self, value):
+        """
+        `value` is either the actual value to use or a lookup into the values dict.
+        """
+        if value in self.values_dict:
+            self.values = self.values_dict[value]
         else:
-            values_dict[family_name] = models
+            self.values = [value]
+
+    @property
+    def values_dict(self):
+        values_dict = {
+            "full_functionality_text": get_model_names_with_tag(FULL_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "ai21/j1-jumbo": ["ai21/j1-jumbo"],
+            "openai/curie": ["openai/curie"],
+            "chat_run": ["openai/chat-gpt", "openai/text-davinci-003"],  # Compare ChatGPT to text-davinci-003
+            "all": get_all_models(),
+            "text_code": get_all_text_models() + get_all_code_models(),
+            "text": get_all_text_models(),
+            "code": get_all_code_models(),
+            "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
+            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
+            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
+            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
+            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
+            "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
+            "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
+            "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
+            "opinions_qa_openai": [
+                "openai/ada",
+                "openai/davinci",
+                "openai/text-ada-001",
+                "openai/text-davinci-001",
+                "openai/text-davinci-002",
+                "openai/text-davinci-003",
+            ],
+            "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        }
+
+        # For each of the keys above (e.g., "text"), create a corresponding ablation (e.g., "ablation_text")
+        # which contains the subset of models with the ablation tag.
+        ablation_models = set(get_model_names_with_tag(ABLATION_MODEL_TAG))
+        ablation_values_dict = {}
+        for family_name, models in values_dict.items():
+            ablation_values_dict["ablation_" + family_name] = list(ablation_models & set(models))
+        for family_name, models in ablation_values_dict.items():
+            if family_name == "ablation_all":
+                values_dict["ablation"] = models
+            else:
+                values_dict[family_name] = models
+        return values_dict


 ############################################################
@@ -821,21 +843,113 @@ class NumOutputTokensRunExpander(RunExpander):
         ]


-RUN_EXPANDERS = dict(
-    (expander.name, expander)
-    for expander in [
-        InstructionsRunExpander,
-        PromptRunExpander,
-        NewlineRunExpander,
-        StopRunExpander,
-        GlobalPrefixRunExpander,
-        NumTrainTrialsRunExpander,
-        MaxTrainInstancesRunExpander,
-        NumOutputsRunExpander,
-        ModelRunExpander,
-        DataAugmentationRunExpander,
-        TokenizerRunExpander,
-        NumPromptTokensRunExpander,
-        NumOutputTokensRunExpander,
-    ]
-)
+class ChatMLRunExpander(RunExpander):
+    """
+    Adapt to ChatML: https://github.com/openai/openai-python/blob/main/chatml.md
+    A 1-shot example:
+    <|im_start|>system
+    Translate from English to French
+    <|im_end|>
+    <|im_start|>user
+    How are you?
+    <|im_end|>
+    <|im_start|>user
+    Comment allez-vous?
+    <|im_end|>
+    <|im_start|>user
+    {{user input here}}<|im_end|>
+    """
+
+    name = "chatml"
+
+    def __init__(self):
+        self.name = type(self).name
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        adapter_spec = run_spec.adapter_spec
+        # according to https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
+        # few-shot examples should do `<|im_start|>system name=example_user`
+        # or `<|im_start|>system name=example_assistant`
+        # but it is also possible to put examples into a user message.
+
+        scenario_name = run_spec.name.split(":")[0]
+
+        if scenario_name in ("msmarco",):
+            # output_prefix:
+            # Does the passage answer the query?
+            # Answer:
+            #
+            # new_output_prefix:
+            # Does the passage answer the query?<|im_end|>
+            # <|im_start|>assistant
+            # Answer:
+
+            new_output_prefix = (
+                adapter_spec.output_prefix.split("\n")[0]
+                + "<|im_end|>\n<|im_start|>assistant\n"
+                + adapter_spec.output_prefix.split("\n")[1]
+            )
+
+        elif scenario_name in ("summarization_cnndm", "summarization_xsum"):
+            # output_prefix:
+            # Summarize the above article in 1 sentence.
+            #
+            # new_output_prefix:
+            # Summarize the above article in 1 sentence.<|im_end|>
+            # <|im_start|>assistant
+            #
+
+            new_output_prefix = adapter_spec.output_prefix + "<|im_end|>\n<|im_start|>assistant\n"
+
+        else:
+            # output_prefix:
+            # {output_prefix}
+            #
+            # new_output_prefix:
+            # <|im_end|>
+            # <|im_start|>assistant
+            # {output_prefix}
+
+            new_output_prefix = "<|im_end|>\n<|im_start|>assistant\n" + adapter_spec.output_prefix
+
+        adapter_spec = replace(
+            adapter_spec,
+            # This is a hack to make sure <|im_start|>user goes before the reference.
+            instructions=(
+                f"<|im_start|>system\n{adapter_spec.instructions}<|im_end|>\n<|im_start|>user\n"
+                if adapter_spec.instructions != ""
+                else "<|im_start|>user\n"
+            ),
+            instance_prefix="",
+            output_prefix=new_output_prefix,
+            output_suffix="<|im_end|>\n<|im_start|>user\n",
+            stop_sequences=adapter_spec.stop_sequences + ["<|im_end|>"],
+        )
+
+        return [
+            replace(
+                run_spec,
+                adapter_spec=adapter_spec,
+            ),
+        ]
+
+
+RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
+    InstructionsRunExpander,
+    PromptRunExpander,
+    NewlineRunExpander,
+    StopRunExpander,
+    GlobalPrefixRunExpander,
+    NumTrainTrialsRunExpander,
+    MaxTrainInstancesRunExpander,
+    NumOutputsRunExpander,
+    ModelRunExpander,
+    DataAugmentationRunExpander,
+    TokenizerRunExpander,
+    NumPromptTokensRunExpander,
+    NumOutputTokensRunExpander,
+    ChatMLRunExpander,
+]
+
+
+RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)
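ModelRunExpander now resolves its argument at construction time: a key found in values_dict expands to that model family, and anything else is passed through as a literal model name. A small sketch under that reading of the __init__ added above; "my-org/my-model" is a hypothetical name that is not a values_dict key.

# Sketch only: demonstrates the family-vs-literal lookup in ModelRunExpander.__init__.
from helm.benchmark.run_expander import ModelRunExpander

family = ModelRunExpander("opinions_qa_ai21")
print(family.values)   # ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"]

literal = ModelRunExpander("my-org/my-model")  # hypothetical, not a values_dict key
print(literal.values)  # ["my-org/my-model"]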
helm/benchmark/run_specs.py CHANGED
@@ -14,12 +14,13 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
 from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .metrics.metric import MetricSpec
-from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander
+from .run_expander import RUN_EXPANDERS, GlobalPrefixRunExpander, StopRunExpander, ChatMLRunExpander
 from .runner import RunSpec
 from .scenarios.lex_glue_scenario import (
     get_lex_glue_max_train_instances,
     get_lex_glue_instructions,
     get_lex_glue_max_tokens,
+    get_lex_glue_task_type,
 )
 from .scenarios.scenario import ScenarioSpec
 from .scenarios.big_bench_scenario import BIGBenchScenario
@@ -31,8 +32,10 @@ from .scenarios.lextreme_scenario import (
     get_lextreme_instructions,
     get_lextreme_max_train_instances,
     get_lextreme_max_tokens,
+    TaskType,
+    get_lextreme_task_type,
 )
-from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG
+from helm.proxy.models import get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG
 from helm.common.general import singleton


@@ -47,7 +50,14 @@ def format_instructions(instructions: str) -> str:


 def get_multiple_choice_joint_adapter_spec(
-    instructions: str, input_noun: Optional[str], output_noun: str, max_train_instances: int = 5, **kwargs
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
 ) -> AdapterSpec:
     """
     [instructions]
@@ -64,6 +74,7 @@
     [reference_k]
     [output_noun]:
     """
+
     return AdapterSpec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
         instructions=format_instructions(instructions),
@@ -72,10 +83,11 @@
         output_prefix=f"{output_noun}: ",
         output_suffix="\n",
         max_train_instances=max_train_instances,
-        num_outputs=1,
-        max_tokens=5,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
         temperature=0.0,
         stop_sequences=["\n"],
+        sample_train=sample_train,
         **kwargs,
     )

@@ -109,15 +121,26 @@ def get_multiple_choice_adapter_spec(
     input_noun: Optional[str],
     output_noun: str,
     max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 5,
     empty_input: bool = False,
+    sample_train: bool = True,
     **kwargs,
 ):
+
     """
     Toggle between joint and separate adapters.
     """
     if method == ADAPT_MULTIPLE_CHOICE_JOINT:
         return get_multiple_choice_joint_adapter_spec(
-            instructions, input_noun, output_noun, max_train_instances, **kwargs
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
         )
     elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
         return get_multiple_choice_separate_adapter_spec(method, empty_input)
@@ -385,8 +408,12 @@ def get_f1_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])


-def get_classification_metric_specs() -> List[MetricSpec]:
-    return [MetricSpec(class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={})]
+def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
+    return [
+        MetricSpec(
+            class_name="helm.benchmark.classification_metrics.ClassificationMetric", args={"delimiter": delimiter}
+        )
+    ]


 def get_bbq_metric_specs() -> List[MetricSpec]:
@@ -1724,6 +1751,14 @@ def get_pubmed_qa_spec() -> RunSpec:
     )


+def build_classification_metrics(task_type):
+    if task_type in [TaskType.QA, TaskType.SLTC]:
+        return get_classification_metric_specs(delimiter=None)
+    elif task_type == TaskType.MLTC:
+        return get_classification_metric_specs(delimiter=",")
+    return []
+
+
 def get_lextreme_spec(subset: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.lextreme_scenario.LEXTREMEScenario",
@@ -1742,7 +1777,7 @@ def get_lextreme_spec(subset: str) -> RunSpec:
         name=f"lextreme:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_f1_metric_specs(),
+        metric_specs=build_classification_metrics(get_lextreme_task_type(subset)),
         groups=["lextreme"],
     )

@@ -1765,7 +1800,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec:
         name=f"lex_glue:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_f1_metric_specs(),
+        metric_specs=build_classification_metrics(get_lex_glue_task_type(subset)),
         groups=["lex_glue"],
     )

@@ -1801,6 +1836,40 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
     )


+def get_opinions_qa_spec(
+    survey_type: str,
+    num_logprobs: str,
+    context: str = "None",
+    num_train_trials: str = "1",
+    method: str = ADAPT_MULTIPLE_CHOICE_JOINT,
+) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.opinions_qa_scenario.OpinionsQAScenario",
+        args={"survey_type": survey_type, "context": context},
+    )
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=1 if "steer" in context else 0,
+        max_tokens=1,
+        num_outputs=int(num_logprobs),
+        num_train_trials=1 if context != "steer-qa" else int(num_train_trials),
+        sample_train=False,
+    )
+
+    return RunSpec(
+        name=f"opinions_qa:survey={survey_type},num_logprobs={num_logprobs}"
+        + f",context={context},num_train_trials={num_train_trials}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=[],
+        groups=["opinions_qa"],
+    )
+
+
 ############################################################

 CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
@@ -1858,6 +1927,7 @@ CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {
     "med_paragraph_simplification": get_med_paragraph_simplification_spec,
     "med_qa": get_med_qa_spec,
     "pubmed_qa": get_pubmed_qa_spec,
+    "opinions_qa": get_opinions_qa_spec,
 }


@@ -1900,6 +1970,10 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
         global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
         run_spec = singleton(global_prefix_expander.expand(run_spec))

+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
         return run_spec

     run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
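The new get_opinions_qa_spec is registered under the "opinions_qa" key of CANONICAL_RUN_SPEC_FUNCS above. A hedged sketch of calling it directly; the survey identifier is a hypothetical placeholder, and only the behavior visible in the hunk above is assumed.

# Sketch only: the survey name below is a placeholder, not a real OpinionsQA survey.
from helm.benchmark.run_specs import get_opinions_qa_spec

run_spec = get_opinions_qa_spec(
    survey_type="American_Trends_Panel_W26",  # hypothetical survey identifier
    num_logprobs="10",                        # string, converted with int() into num_outputs
    context="steer-qa",                       # "steer" in context -> one in-context train example
)
print(run_spec.name)
# opinions_qa:survey=American_Trends_Panel_W26,num_logprobs=10,context=steer-qa,num_train_trials=1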
helm/benchmark/runner.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+import traceback
 import typing
 from collections import Counter
 from dataclasses import dataclass, field
@@ -25,6 +26,12 @@ from .metrics.tokens_metric import TokensMetric
 from .window_services.tokenizer_service import TokenizerService


+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass
+
+
 @dataclass(frozen=True)
 class RunSpec:
     """
@@ -71,12 +78,16 @@ class Runner:
         output_path: str,
         suite: str,
         skip_instances: bool,
+        skip_completed_runs: bool,
+        exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
         self.skip_instances: bool = skip_instances
+        self.skip_completed_runs: bool = skip_completed_runs
+        self.exit_on_error: bool = exit_on_error

         ensure_directory_exists(output_path)
         # Decide where to save the raw data (e.g., "output/scenarios/mmlu").
@@ -91,9 +102,20 @@
         ensure_directory_exists(self.eval_cache_path)

     def run_all(self, run_specs: List[RunSpec]):
-        for run_spec in tqdm(run_specs):
-            with htrack_block(f"Running {run_spec.name}"):
-                self.run_one(run_spec)
+        failed_run_specs: List[RunSpec] = []
+        for run_spec in tqdm(run_specs, disable=None):
+            try:
+                with htrack_block(f"Running {run_spec.name}"):
+                    self.run_one(run_spec)
+            except Exception as e:
+                if self.exit_on_error:
+                    raise e
+                else:
+                    hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
+                    failed_run_specs.append(run_spec)
+        if not self.exit_on_error and failed_run_specs:
+            failed_runs_str = ", ".join([f'"{run_spec.name}"' for run_spec in failed_run_specs])
+            raise RunnerError(f"Failed runs: [{failed_runs_str}]")

     def run_one(self, run_spec: RunSpec):
         # Load the scenario
@@ -106,6 +128,12 @@
         run_path: str = os.path.join(self.runs_path, run_spec.name)
         ensure_directory_exists(run_path)

+        if self.skip_completed_runs and os.path.exists(os.path.join(run_path, "scenario_state.json")):
+            # If scenario_state.json exists, assume that all other output files exist
+            # because scenario_state.json is the last output file to be written.
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+
         # Fetch and initialize the Adapter based on the `AdapterSpec`.
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)

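With exit_on_error=False, run_all now finishes the whole list and raises a single RunnerError naming the failed runs. A sketch of how a caller might surface that; `runner` and `run_specs` are assumed to be constructed as in run.py above.

# Sketch only: assumes `runner` and `run_specs` already exist.
from helm.benchmark.runner import RunnerError

try:
    runner.run_all(run_specs)
except RunnerError as e:
    # Per-run tracebacks were already logged via hlog inside run_all.
    print(f"Some runs failed: {e}")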
helm/benchmark/scenarios/copyright_scenario.py CHANGED
@@ -72,7 +72,7 @@ class CopyrightScenario(Scenario):

         # Read all the instances
         instances: List[Instance] = []
-        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances"):
+        for prefix, prefix_to_end in tqdm.tqdm(data["data"].items(), desc="load instances", disable=None):
             instances.append(
                 Instance(
                     input=Input(text=prefix),
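The disable=None argument passed to tqdm here and in runner.py follows tqdm's documented behavior: hide the progress bar when output is not a TTY (e.g., redirected logs) while keeping it for interactive terminals. A minimal illustration:

# Progress bar appears in a terminal, stays silent when output is redirected to a file.
from tqdm import tqdm

for _ in tqdm(range(1000), desc="load instances", disable=None):
    pass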