PyPI - crfm-helm - Versions diffs - 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (546) hide show

helm/benchmark/scenarios/lm_entry_scenario.py ADDED Viewed

@@ -0,0 +1,185 @@
+import json
+import os
+from typing import List
+from helm.common.general import ensure_file_downloaded
+from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+class LMEntryScenario(Scenario):
+    """
+    The LMentry Benchmark
+    https://arxiv.org/pdf/2211.02069.pdf
+    The implementation is with reference to the original repo: https://github.com/aviaefrat/lmentry
+    The data is also downloaded from the repo.
+    LMentry evaluates an LM's ability to perform elementary language tasks. Examples include
+    finding which word is shorter, or which word is the last in a sentence.
+    """
+    name = "lm_entry"
+    description = "The LMentry benchmark for elementary language tasks"
+    tags: List[str] = []
+    url_template = "https://raw.githubusercontent.com/aviaefrat/lmentry/main/data/{subtask}.json"
+    task_to_subtasks = {
+        "all_words_from_category": [
+            "all_words_from_category",
+            "all_words_from_category_0_distractors",
+            "all_words_from_category_1_distractors",
+            "all_words_from_category_2_distractors",
+        ],
+        "any_words_from_category": [
+            "any_words_from_category",
+            "any_words_from_category_3_distractors",
+            "any_words_from_category_4_distractors",
+            "any_words_from_category_5_distractors",
+        ],
+        "bigger_number": ["bigger_number"],
+        # "ends_with_letter": ["ends_with_letter"], # HELM's metrics currently don't support this
+        # "ends_with_word": ["ends_with_word"], # HELM's metrics currently don't support this
+        "first_alphabetically": [
+            "first_alphabetically",
+            "first_alphabetically_consecutive_first_letter",
+            "first_alphabetically_different_first_letter",
+            "first_alphabetically_far_first_letter",
+            "first_alphabetically_same_first_letter",
+        ],
+        "first_letter": ["first_letter"],
+        "first_word": ["first_word"],
+        "homophones": ["homophones"],
+        "last_letter": ["last_letter"],
+        "last_word": ["last_word"],
+        "least_associated_word": ["least_associated_word"],
+        "less_letters": ["less_letters", "less_letters_length_diff_1", "less_letters_length_diff_3plus"],
+        "more_letters": ["more_letters", "more_letters_length_diff_1", "more_letters_length_diff_3plus"],
+        "most_associated_word": ["most_associated_word"],
+        "rhyming_word": [
+            "rhyming_word",
+            "rhyming_word_orthographically_different",
+            "rhyming_word_orthographically_similar",
+        ],
+        # "sentence_containing": ["sentence_containing"], # HELM's metrics currently don't support this
+        # "sentence_not_containing": ["sentence_not_containing"], # HELM's metrics currently don't support this
+        "smaller_number": ["smaller_number"],
+        # "starts_with_letter": ["starts_with_letter"], # HELM's metrics currently don't support this
+        # "starts_with_word": ["starts_with_word"], # HELM's metrics currently don't support this
+        "word_after": ["word_after"],
+        "word_before": ["word_before"],
+        # "word_containing": ["word_containing"], # HELM's metrics currently don't support this
+        # "word_not_containing": ["word_not_containing"], # HELM's metrics currently don't support this
+    }
+    def __init__(self, task: str):
+        super().__init__()
+        assert task in self.task_to_subtasks, f"Unsupported task: {task}"
+        self.task: str = task
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        data_paths: List[str] = []
+        for subtask in self.task_to_subtasks[self.task]:
+            data_path: str = os.path.join(output_path, f"{subtask}.json")
+            ensure_file_downloaded(
+                source_url=self.url_template.format(subtask=subtask),
+                target_path=data_path,
+                unpack=False,
+            )
+            data_paths.append(data_path)
+        def generate_references_for_multiple_choice_question(
+            options: List[str], correct_answer: str
+        ) -> List[Reference]:
+            references: List[Reference] = []
+            for option in options:
+                if option == correct_answer:
+                    references.append(Reference(Output(text=option), tags=[CORRECT_TAG]))
+                else:
+                    references.append(Reference(Output(text=option), tags=[]))
+            return references
+        def generate_references_for_generation_question(correct_answer: str) -> List[Reference]:
+            return generate_references_for_multiple_choice_question(
+                options=[correct_answer], correct_answer=correct_answer
+            )
+        instances: List[Instance] = []
+        for data_path in data_paths:
+            data: dict = json.load(open(data_path))
+            for example in data["examples"].values():
+                input_text: str = example["input"]
+                # Normalize the input text to the same format
+                if input_text.startswith("Q: "):
+                    input_text = input_text[3:]
+                if input_text.endswith("\n"):
+                    input_text = input_text[:-1]
+                elif input_text.endswith("\nA:"):
+                    input_text = input_text[:-3]
+                input = Input(text=input_text)
+                references: List[Reference]
+                if self.task == "all_words_from_category":
+                    correct_answer = "yes" if example["metadata"]["num_distractors"] == 0 else "no"
+                    references = generate_references_for_multiple_choice_question(
+                        options=["yes", "no"], correct_answer=correct_answer
+                    )
+                elif self.task == "any_words_from_category":
+                    correct_answer = "yes" if len(example["metadata"]["category_words"]) > 0 else "no"
+                    references = generate_references_for_multiple_choice_question(
+                        options=["yes", "no"], correct_answer=correct_answer
+                    )
+                elif self.task == "bigger_number" or self.task == "smaller_number":
+                    references = generate_references_for_multiple_choice_question(
+                        options=[str(example["metadata"]["n1"]), str(example["metadata"]["n2"])],
+                        correct_answer=str(example["metadata"]["answer"]),
+                    )
+                elif self.task == "first_alphabetically":
+                    references = generate_references_for_multiple_choice_question(
+                        options=[example["metadata"]["word1"], example["metadata"]["word2"]],
+                        correct_answer=example["metadata"]["answer"],
+                    )
+                elif self.task == "first_letter" or self.task == "last_letter":
+                    references = generate_references_for_generation_question(example["metadata"]["answer"])
+                elif self.task == "first_word" or self.task == "last_word":
+                    references = generate_references_for_generation_question(example["metadata"]["answer"])
+                elif self.task == "homophones":
+                    references = generate_references_for_multiple_choice_question(
+                        options=[example["metadata"]["answer"], example["metadata"]["distractor"]],
+                        correct_answer=example["metadata"]["answer"],
+                    )
+                elif self.task == "least_associated_word" or self.task == "most_associated_word":
+                    references = generate_references_for_multiple_choice_question(
+                        options=[example["metadata"]["answer"]] + example["metadata"]["distractors"],
+                        correct_answer=example["metadata"]["answer"],
+                    )
+                elif self.task == "less_letters" or self.task == "more_letters":
+                    references = generate_references_for_multiple_choice_question(
+                        options=[example["metadata"]["word1"], example["metadata"]["word2"]],
+                        correct_answer=example["metadata"]["answer"],
+                    )
+                elif self.task == "rhyming_word":
+                    references = generate_references_for_multiple_choice_question(
+                        options=[example["metadata"]["answer"], example["metadata"]["distractor"]],
+                        correct_answer=example["metadata"]["answer"],
+                    )
+                elif self.task == "word_before" or self.task == "word_after":
+                    references = generate_references_for_generation_question(example["metadata"]["answer"])
+                else:
+                    raise ValueError(f"Unsupported task: {self.task}")
+                instance = Instance(
+                    input=input,
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+                instances.append(instance)
+        return instances

helm/benchmark/scenarios/lsat_qa_scenario.py CHANGED Viewed

@@ -101,9 +101,11 @@ class LSATScenario(Scenario):
     def get_question_types(self, tags: List[str]) -> List[str]:
         question_type: str = tags[2].replace("grouping (distribution)", "distribution grouping") or "miscellaneous"
+        types = [question_type.replace(" ", "_").replace("/", "_")]
         main_type = self.subtype2type.get(question_type)
-        assert main_type
-        return [question_type.replace(" ", "_").replace("/", "_"), main_type]
+        if main_type is not None:
+            types.append(main_type)
+        return types
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path = os.path.join(output_path, "data")

helm/benchmark/scenarios/math_scenario.py CHANGED Viewed

@@ -1,9 +1,20 @@
 import collections
+import os
 import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 def remove_boxed(string: str) -> Optional[str]:
@@ -354,7 +365,13 @@ class MATHScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         dataset = {}
-        data = typing.cast(DatasetDict, load_dataset("competition_math", ignore_verifications=True))
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        data = (
+            typing.cast(DatasetDict, load_dataset("competition_math", cache_dir=cache_dir))
+            .sort("problem")
+            .shuffle(seed=42)
+        )
         def group_by_key(dataset_list, key):
             dataset_per_key = collections.defaultdict(list)

helm/benchmark/scenarios/medication_qa_scenario.py ADDED Viewed

@@ -0,0 +1,60 @@
+import os
+from typing import List
+import pandas as pd
+from helm.common.general import ensure_file_downloaded
+from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+class MedicationQAScenario(Scenario):
+    """
+    The gold standard corpus for medication question answering introduced in the MedInfo 2019 paper
+    "Bridging the Gap between Consumers’ Medication Questions and Trusted Answers":
+    http://ebooks.iospress.nl/publication/51941
+    This dataset has consumer questions, as opposed to very clinical questions.
+    Paper citation:
+        @inproceedings{BenAbacha:MEDINFO19,
+        author    = {Asma {Ben Abacha} and Yassine Mrabet and Mark Sharp and
+                    Travis Goodwin and Sonya E. Shooshan and Dina Demner{-}Fushman},
+        title     = {Bridging the Gap between Consumers’ Medication Questions and Trusted Answers},
+        booktitle = {MEDINFO 2019},
+        year      = {2019},
+        }
+    """
+    SOURCE_REPO_URL = "https://github.com/abachaa/Medication_QA_MedInfo2019/raw/master/"
+    FILENAME = "MedInfo2019-QA-Medications.xlsx"
+    name = "medication_qa"
+    description = "MedInfo 2019 MedicationQA medication question answering task"
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+    def download_medication_qa(self, path: str):
+        """download the .xlsx spreadsheet containing the question-answer pairs"""
+        ensure_file_downloaded(
+            source_url=os.path.join(self.SOURCE_REPO_URL, self.FILENAME),
+            target_path=os.path.join(path, self.FILENAME),
+            unpack=False,
+        )
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.download_medication_qa(output_path)
+        data_path = os.path.join(output_path, self.FILENAME)
+        data = pd.read_excel(data_path)
+        data = data[~data.Answer.isna()]  # remove rows missing answers
+        instances = [
+            Instance(
+                input=Input(row.Question),
+                references=[Reference(Output(row.Answer), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            for _, row in data.iterrows()
+        ]
+        return instances

helm/benchmark/scenarios/numeracy_scenario.py CHANGED Viewed

@@ -3,7 +3,7 @@ from collections import defaultdict
 from dataclasses import dataclass, field
 from itertools import combinations_with_replacement, product
 import math
-from math import comb  # type: ignore
+from math import comb
 import numpy as np
 import numpy.typing as npt
 import random
@@ -358,7 +358,7 @@ def distance_paraboloid(point: List[int], rel_str: str, TOL: float = 1e-10):
         sols = []
         # Try each possible combined solution for x, y, z, λ
         for sol_xyz, val_λs in zip(sols_xyz, vals_λ):
-            val_λs = list(set(filter(lambda _: not _.is_symbol, val_λs)))  # get distinct values for λ if there are any
+            val_λs = tuple(set(filter(lambda _: not _.is_symbol, val_λs)))  # get distinct values for λ if there are any
             if len(val_λs) > 1:  # there can be at most one distinct value for λ
                 continue
             val_λ = val_λs[0] if val_λs else λ
@@ -544,7 +544,7 @@ def get_numeracy_adapter_spec(
                 "max_eval_instances": max_eval_instances,
                 "num_outputs": 1,
                 "num_train_trials": 1,
-                "model": "openai/davinci",
+                "model_deployment": "openai/davinci",
                 "temperature": 0,
                 "stop_sequences": ["\n"],
                 "max_tokens": 20,

helm/benchmark/scenarios/opinions_qa_scenario.py CHANGED Viewed

@@ -107,18 +107,17 @@ class OpinionsQAScenario(Scenario):
         self.survey_type: str = survey_type
         self.context: str = context
-    def download_data(self):
-        output_path: str = os.path.join(output_path, "data")
-        if not os.path.exists(output_path):
-            os.makedirs(output_path)
+    def download_data(self, output_path: str):
+        data_dir: str = os.path.join(output_path, "data")
+        if not os.path.exists(data_dir):
+            os.makedirs(data_dir)
         DOWNLOAD_FILENAMES = [self.FILE_NAME.format(wave=wave) for wave in self.PEW_SURVEY_WAVES]
         DOWNLOAD_FILENAMES += [f"{steer}.csv" for steer in ["steer-qa", "steer-bio", "steer-portray"]]
         DOWNLOAD_FILENAMES += ["Pew_American_Trends_Panel_disagreement_500.csv"]
         for filename in DOWNLOAD_FILENAMES:
-            data_path: str = os.path.join(output_path, filename)
+            data_path: str = os.path.join(data_dir, filename)
             source_url: str = self.CODALAB_URI_TEMPLATE.format(bundle=self.CODALAB_BUNDLE, filename=filename)
             ensure_file_downloaded(source_url=source_url, target_path=data_path, downloader_executable="gdown")
@@ -129,7 +128,7 @@ class OpinionsQAScenario(Scenario):
         return df
     def get_instances(self, output_path: str) -> List[Instance]:
-        self.download_data()
+        self.download_data(output_path)
         # Read all the instances
         instances: List[Instance] = []
@@ -150,14 +149,12 @@ class OpinionsQAScenario(Scenario):
             bios_df = pd.read_csv(bios_path, sep="\t")
         for split in all_splits:
             csv_path: str = csv_dict[split]
             assert os.path.exists(csv_path)
             question_df = self.read_survey_questions(csv_path)
             for qidx, (question, answers) in enumerate(zip(question_df["question"], question_df["options"])):
                 # Opinions QA test questions have no correct answer and thus we set it to be None by default
                 # for all test instances.
                 # In the case where context = steer-qa, we add demographic information in the form of a
@@ -182,7 +179,6 @@ class OpinionsQAScenario(Scenario):
                 else:
                     # context = "steer-bio" or "steer-portray"
                     for bio in bios_df["question"].values:
                         context = PassageQuestionInput(passage=bio, question=question + "\n")
                         instance = Instance(
                             context,

helm/benchmark/scenarios/raft_scenario.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import random
 import os
 import json
-import tempfile
 import datasets
 from pathlib import Path
 from typing import List, Dict
@@ -26,12 +25,9 @@ SUBSETS = [
 ]
-def get_raft_prompt_settings(subset: str, cache_dir=None):
+def get_raft_prompt_settings(subset: str, cache_dir: str):
     assert subset in SUBSETS, "Unknown subset: {}".format(subset)
-    if cache_dir is None:
-        cache_dir = tempfile.gettempdir()
     prompt_construction_settings_path = os.path.join(cache_dir, "prompt_construction_settings.json")
     ensure_directory_exists(cache_dir)
     ensure_file_downloaded(
@@ -44,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir=None):
     return field_ordering[subset], instructions[subset]
-def get_raft_instructions(subset: str, cache_dir=None):
+def get_raft_instructions(subset: str, cache_dir: str):
     return get_raft_prompt_settings(subset, cache_dir)[1]

helm/benchmark/scenarios/scenario.py CHANGED Viewed

@@ -1,12 +1,13 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, replace
 from typing import List, Optional, Tuple
+import os
 from pathlib import PurePath
 import inspect
 from helm.common.media_object import MultimediaObject
 from helm.common.object_spec import ObjectSpec, create_object
-from helm.common.general import format_text, format_split, format_tags, indent_lines
+from helm.common.general import ensure_directory_exists, format_text, format_split, format_tags, indent_lines
 from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
 """ Data splits """
@@ -24,6 +25,10 @@ DEFAULT_TEST_SIZE: int = 1000
 """ Reference tags """
 CORRECT_TAG: str = "correct"
+""" Asset tags (used for compiled outputs such as image2structure)"""
+ASSET_NAME_TAG: str = "asset_name"
+ASSET_PATH_TAG: str = "asset_path"
 # Reference tag functions for ranking scenarios.
 # @TODO: (For future) Should there be a base RankingScenario class?
@@ -177,7 +182,7 @@ class Instance:
 # TODO(#1212): Scenario should not be a dataclass.
-@dataclass  # type: ignore
+@dataclass
 class Scenario(ABC):
     """
     A scenario represents a (task, data distribution).
@@ -249,3 +254,10 @@ class ScenarioSpec(ObjectSpec):
 def create_scenario(scenario_spec: ScenarioSpec) -> Scenario:
     """Construct the scenario and set some fields."""
     return create_object(scenario_spec)
+def get_scenario_cache_path(benchmark_output_path: str, scenario_name: str):
+    """Return a directory under benchmark_output_path in which Scenario can cache temporary data."""
+    scenarios_path: str = os.path.join(benchmark_output_path, "scenarios", scenario_name)
+    ensure_directory_exists(scenarios_path)
+    return scenarios_path

helm/benchmark/scenarios/simple_scenarios.py CHANGED Viewed

@@ -1,7 +1,128 @@
+"""Simple scenarios for debugging and for tutorials.
+NOTE: Typically, each scenario should be in its own file,
+but these scenarios are placed in the same module for
+tutorial purposes."""
 import random
 from typing import List
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+class SimpleMCQAScenario(Scenario):
+    """Simple multiple-choice question answering scenario for tutorials and debugging.
+    The task is to answer questions about whether two-digit numbers are even or odd.
+    Example:
+        Answer the following questions with a single letter only.
+        Question: Is 24 even or odd?
+        A. Even
+        B. Odd
+        Answer: A"""
+    name = "simple_mcqa"
+    description = "Answer if two-digit numbers are even or odd."
+    tags = ["question answering"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for i in range(10, 100):
+            # NOTE: For simplicity, the input text and reference output text
+            # is the same for all instances.
+            # However, for most question answering scenarios, the input text
+            # and reference output text can vary between questions.
+            input = Input(text=f"Is {i} even or odd?")
+            references = [
+                Reference(Output(text="Even"), tags=[CORRECT_TAG] if i % 2 == 0 else []),
+                Reference(Output(text="Odd"), tags=[CORRECT_TAG] if i % 2 == 1 else []),
+            ]
+            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+class SimpleShortAnswerQAScenario(Scenario):
+    """Simple short answer question answering scenario for tutorials and debugging.
+    The task is to answer questions about whether two-digit numbers are even or odd.
+    Example:
+        Answer the following questions with a single word only.
+        Question: Is 24 even or odd?
+        Answer: Even"""
+    name = "simple_mcqa"
+    description = "Answer if two-digit numbers are even or odd."
+    tags = ["question answering"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for i in range(10, 100):
+            # NOTE: For simplicity, the input text and reference output text
+            # is the same for all instances.
+            # However, for most question answering scenarios, the input text
+            # and reference output text can vary between questions.
+            input = Input(text=f"Is {i} even or odd?")
+            correct_answer = "Even" if i % 2 == 0 else "Odd"
+            # NOTE: Unlike multiple-choice question answering, only the correct
+            # references are needed for short-answer question answering.
+            references = [
+                Reference(Output(text=correct_answer), tags=[CORRECT_TAG]),
+            ]
+            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+class SimpleClassificationScenario(Scenario):
+    """Simple classification scenario for tutorials and debugging.
+    The task is to classify two-digit numbers as even or odd.
+    Example:
+        Classify the following numbers by their parity. The classes are "Even" and "Odd".
+        Number: 24
+        Parity: Even"""
+    name = "simple_classification"
+    description = "Classify numbers by pairity."
+    tags = ["classification"]
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for i in range(10, 100):
+            input = Input(text=str(i))
+            # NOTE: For classification scenarios, the reference outputs should be the same
+            # for all instances, and should include both correct and incorrect classes.
+            # HELM only supports single-label classification. Exactly one reference
+            # should have the CORRECT_TAG tag.
+            references = [
+                Reference(Output(text="Even"), tags=[CORRECT_TAG] if i % 2 == 0 else []),
+                Reference(Output(text="Odd"), tags=[CORRECT_TAG] if i % 2 == 1 else []),
+            ]
+            split = TRAIN_SPLIT if i <= 20 else TEST_SPLIT
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
 class Simple1Scenario(Scenario):

helm/benchmark/scenarios/test_math_scenario.py ADDED Viewed

@@ -0,0 +1,22 @@
+import pytest
+from tempfile import TemporaryDirectory
+from helm.benchmark.scenarios.math_scenario import MATHScenario
+from helm.benchmark.scenarios.scenario import Input, Output, Reference
+# TODO: Fix the test for newer versions of diffusers: https://github.com/stanford-crfm/helm/issues/2168
+@pytest.mark.skip(
+    reason="Incompatible with newer versions with diffusers>0.24.0. Fails with "
+    '"Loading a dataset cached in a LocalFileSystem is not supported"'
+)
+def test_math_scenario_get_instances():
+    math_scenario = MATHScenario(subject="number_theory", level="1")
+    with TemporaryDirectory() as tmpdir:
+        actual_instances = math_scenario.get_instances(tmpdir)
+    assert len(actual_instances) == 77
+    assert actual_instances[0].input == Input(text="What is the remainder when (99)(101) is divided by 9?")
+    assert actual_instances[0].references == [
+        Reference(output=Output(text="0", multimedia_content=None), tags=["correct"])
+    ]
+    assert actual_instances[0].split == "train"

helm/benchmark/scenarios/test_scenario.py CHANGED Viewed

@@ -1,10 +1,13 @@
-from helm.benchmark.run_specs import get_scenario_spec_tiny
-from helm.benchmark.scenarios.scenario import create_scenario, Scenario, Input, PassageQuestionInput
+from helm.benchmark.scenarios.scenario import ScenarioSpec, create_scenario, Scenario, Input, PassageQuestionInput
 class TestScenario:
     def setup_method(self, method):
-        self.scenario: Scenario = create_scenario(get_scenario_spec_tiny())
+        scenario_spec: ScenarioSpec = ScenarioSpec(
+            class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
+            args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2},
+        )
+        self.scenario: Scenario = create_scenario(scenario_spec)
     def test_render_lines(self):
         instances = self.scenario.get_instances(output_path="")

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl