crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.

Files changed (606)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/call_center_annotator.py +22 -11
  36. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  37. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  38. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  39. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  40. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  41. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  42. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  43. helm/benchmark/annotation/live_qa_annotator.py +10 -5
  44. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  45. helm/benchmark/annotation/medalign_annotator.py +100 -0
  46. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  47. helm/benchmark/annotation/medication_qa_annotator.py +90 -61
  48. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  49. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  50. helm/benchmark/annotation/model_as_judge.py +281 -18
  51. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  52. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  54. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  55. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  56. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  57. helm/benchmark/annotation/spider_annotator.py +18 -0
  58. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  59. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  60. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  61. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  62. helm/benchmark/annotation/xstest_annotator.py +20 -30
  63. helm/benchmark/annotation_executor.py +35 -15
  64. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  65. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  66. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  67. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  68. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  69. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  70. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  71. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  72. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  73. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  74. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  75. helm/benchmark/augmentations/perturbation.py +1 -1
  76. helm/benchmark/augmentations/space_perturbation.py +2 -2
  77. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  78. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  79. helm/benchmark/augmentations/test_perturbation.py +16 -13
  80. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  81. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  82. helm/benchmark/data_preprocessor.py +2 -2
  83. helm/benchmark/huggingface_registration.py +2 -7
  84. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  85. helm/benchmark/metrics/basic_metrics.py +6 -6
  86. helm/benchmark/metrics/bbq_metrics.py +2 -2
  87. helm/benchmark/metrics/bias_metrics.py +12 -3
  88. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  89. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  90. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  91. helm/benchmark/metrics/classification_metrics.py +76 -12
  92. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  93. helm/benchmark/metrics/code_metrics.py +5 -5
  94. helm/benchmark/metrics/comet_metric.py +125 -0
  95. helm/benchmark/metrics/common_metric_specs.py +9 -2
  96. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  97. helm/benchmark/metrics/copyright_metrics.py +4 -4
  98. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  99. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  101. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  102. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  103. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  104. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  105. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  106. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  107. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  108. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  109. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  110. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  113. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  115. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  116. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  117. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  118. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  120. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  122. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  123. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  124. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  125. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  126. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  127. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  128. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  129. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  130. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  131. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  132. helm/benchmark/metrics/medalign_metrics.py +34 -0
  133. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  134. helm/benchmark/metrics/medec_metrics.py +101 -0
  135. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  136. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  137. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  138. helm/benchmark/metrics/metric.py +3 -3
  139. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  140. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  141. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  142. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  143. helm/benchmark/metrics/nltk_helper.py +32 -0
  144. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  145. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  146. helm/benchmark/metrics/output_processing_metric.py +60 -0
  147. helm/benchmark/metrics/output_processors.py +15 -0
  148. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  149. helm/benchmark/metrics/ranking_metrics.py +3 -3
  150. helm/benchmark/metrics/reference_metric.py +3 -3
  151. helm/benchmark/metrics/safety_metrics.py +39 -17
  152. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  153. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  154. helm/benchmark/metrics/spider_metrics.py +7 -0
  155. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  156. helm/benchmark/metrics/statistic.py +1 -1
  157. helm/benchmark/metrics/summac/model_summac.py +1 -1
  158. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  159. helm/benchmark/metrics/summarization_metrics.py +19 -9
  160. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  161. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  162. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  163. helm/benchmark/metrics/test_metric.py +1 -1
  164. helm/benchmark/metrics/test_statistic.py +2 -2
  165. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  167. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  169. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  170. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  171. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  172. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  173. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  174. helm/benchmark/metrics/unitxt_metrics.py +21 -4
  175. helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
  176. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  177. helm/benchmark/model_metadata_registry.py +16 -0
  178. helm/benchmark/presentation/create_plots.py +1 -1
  179. helm/benchmark/presentation/schema.py +3 -0
  180. helm/benchmark/presentation/summarize.py +119 -256
  181. helm/benchmark/presentation/test_summarize.py +145 -3
  182. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  183. helm/benchmark/reeval_run.py +203 -0
  184. helm/benchmark/reeval_runner.py +355 -0
  185. helm/benchmark/run.py +8 -17
  186. helm/benchmark/run_expander.py +105 -8
  187. helm/benchmark/run_spec_factory.py +12 -0
  188. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  189. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  190. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  191. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  192. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  193. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  194. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  195. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  196. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  197. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  198. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  199. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  200. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  201. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  202. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  203. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  204. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
  205. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  206. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  207. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  208. helm/benchmark/run_specs/vlm_run_specs.py +83 -5
  209. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  210. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  211. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  212. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  213. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  214. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  215. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  216. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  217. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  218. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  219. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  220. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  221. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  222. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  223. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  224. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  225. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  226. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  227. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  228. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  229. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  230. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  231. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  232. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  233. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  234. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  235. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  236. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  237. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  238. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  239. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  240. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  241. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  242. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  243. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  244. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  245. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  246. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  247. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  248. helm/benchmark/scenarios/bold_scenario.py +1 -1
  249. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  250. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  251. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  252. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  253. helm/benchmark/scenarios/clear_scenario.py +153 -0
  254. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  255. helm/benchmark/scenarios/code_scenario.py +17 -4
  256. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  257. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  258. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  259. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  260. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  261. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  262. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  263. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  264. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  265. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  266. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  267. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  268. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  269. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  270. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  271. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  272. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  273. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  274. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  275. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  276. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  277. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  278. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  279. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  280. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  281. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  282. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  283. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  284. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  285. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  286. helm/benchmark/scenarios/ice_scenario.py +8 -4
  287. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  288. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  289. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  290. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  291. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  292. helm/benchmark/scenarios/koala_scenario.py +1 -1
  293. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  294. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  295. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  296. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  297. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  298. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  299. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  300. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  301. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  302. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  303. helm/benchmark/scenarios/math_scenario.py +9 -1
  304. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  305. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  306. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  307. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  308. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  309. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  310. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  311. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  312. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  313. helm/benchmark/scenarios/medec_scenario.py +120 -0
  314. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  315. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  316. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  317. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  318. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  319. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  320. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  321. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  322. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  323. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  324. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  325. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  326. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  327. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  328. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  329. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  330. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  331. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  332. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  333. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  334. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  335. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  336. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  337. helm/benchmark/scenarios/quac_scenario.py +10 -1
  338. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  339. helm/benchmark/scenarios/raft_scenario.py +18 -3
  340. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  341. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  342. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  343. helm/benchmark/scenarios/scenario.py +9 -1
  344. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
  345. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  346. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  347. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  348. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  349. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  350. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  351. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  352. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  353. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  354. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  355. helm/benchmark/scenarios/spider_scenario.py +91 -0
  356. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  357. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  358. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  359. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  360. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  361. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  362. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  363. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  364. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  366. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  367. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  368. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  369. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  370. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  371. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  372. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  373. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  374. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  375. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  376. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  377. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  378. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  379. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  380. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  381. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  382. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  383. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  384. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  385. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  386. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  387. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  388. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  389. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  390. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  391. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  392. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  393. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  394. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  395. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  396. helm/benchmark/server.py +11 -0
  397. helm/benchmark/slurm_runner.py +1 -1
  398. helm/benchmark/static/schema_audio.yaml +752 -0
  399. helm/benchmark/static/schema_autobencher.yaml +150 -0
  400. helm/benchmark/static/schema_call_center.yaml +97 -60
  401. helm/benchmark/static/schema_capabilities.yaml +254 -0
  402. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  403. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  404. helm/benchmark/static/schema_enterprise.yaml +298 -0
  405. helm/benchmark/static/schema_finance.yaml +14 -12
  406. helm/benchmark/static/schema_heim.yaml +1389 -0
  407. helm/benchmark/static/schema_legal.yaml +566 -0
  408. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  409. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  410. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  411. helm/benchmark/static/schema_safety.yaml +42 -6
  412. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
  413. helm/benchmark/static/schema_social_audio.yaml +224 -0
  414. helm/benchmark/static/schema_sql.yaml +171 -0
  415. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
  416. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  417. helm/benchmark/static/schema_vhelm.yaml +151 -47
  418. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  419. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  420. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  421. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  422. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  423. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  424. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  425. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  426. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  427. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  428. helm/benchmark/static_build/config.js +1 -1
  429. helm/benchmark/static_build/index.html +5 -5
  430. helm/benchmark/window_services/default_window_service.py +1 -1
  431. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  432. helm/benchmark/window_services/ice_window_service.py +1 -1
  433. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  434. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  435. helm/benchmark/window_services/local_window_service.py +2 -2
  436. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  437. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  438. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  439. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  440. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  441. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  442. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  443. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  444. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  445. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  446. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  447. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  448. helm/benchmark/window_services/test_utils.py +1 -1
  449. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  450. helm/benchmark/window_services/tokenizer_service.py +0 -5
  451. helm/benchmark/window_services/yalm_window_service.py +1 -1
  452. helm/clients/ai21_client.py +3 -3
  453. helm/clients/aleph_alpha_client.py +1 -1
  454. helm/clients/audio_language/__init__.py +0 -0
  455. helm/clients/audio_language/diva_llama_client.py +118 -0
  456. helm/clients/audio_language/llama_omni_client.py +198 -0
  457. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  458. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  459. helm/clients/auto_client.py +4 -2
  460. helm/clients/azure_openai_client.py +55 -0
  461. helm/clients/bedrock_client.py +201 -7
  462. helm/clients/bedrock_utils.py +33 -0
  463. helm/clients/clip_scorers/clip_scorer.py +1 -1
  464. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  465. helm/clients/cohere_client.py +3 -3
  466. helm/clients/google_client.py +1 -1
  467. helm/clients/http_model_client.py +1 -1
  468. helm/clients/huggingface_client.py +10 -18
  469. helm/clients/ibm_client.py +267 -0
  470. helm/clients/image_generation/adobe_vision_client.py +1 -1
  471. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  472. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  473. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  474. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  475. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  476. helm/clients/image_generation/cogview2_client.py +1 -1
  477. helm/clients/image_generation/dalle2_client.py +1 -1
  478. helm/clients/image_generation/dalle3_client.py +2 -2
  479. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  480. helm/clients/image_generation/dalle_mini/data.py +1 -1
  481. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  482. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  483. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  484. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  485. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  486. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  487. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  488. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  489. helm/clients/image_generation/dalle_mini_client.py +1 -1
  490. helm/clients/image_generation/deep_floyd_client.py +1 -1
  491. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  492. helm/clients/image_generation/lexica_client.py +1 -1
  493. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  494. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  495. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  496. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  497. helm/clients/image_generation/mindalle_client.py +1 -1
  498. helm/clients/image_generation/together_image_generation_client.py +1 -1
  499. helm/clients/lit_gpt_client.py +2 -2
  500. helm/clients/mistral_client.py +62 -18
  501. helm/clients/nvidia_nim_client.py +0 -3
  502. helm/clients/openai_client.py +255 -21
  503. helm/clients/palmyra_client.py +2 -6
  504. helm/clients/reka_client.py +1 -1
  505. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  506. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  507. helm/clients/stanfordhealthcare_google_client.py +43 -0
  508. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  509. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  510. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  511. helm/clients/test_client.py +1 -1
  512. helm/clients/test_together_client.py +6 -1
  513. helm/clients/together_client.py +69 -7
  514. helm/clients/upstage_client.py +23 -0
  515. helm/clients/vertexai_client.py +39 -13
  516. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  517. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  518. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  519. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  520. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  521. helm/clients/vllm_client.py +4 -6
  522. helm/clients/yi_client.py +0 -3
  523. helm/common/audio_utils.py +111 -0
  524. helm/common/cache.py +8 -30
  525. helm/common/file_caches/local_file_cache.py +1 -1
  526. helm/common/file_caches/test_local_file_cache.py +1 -1
  527. helm/common/images_utils.py +2 -2
  528. helm/common/key_value_store.py +9 -9
  529. helm/common/media_object.py +2 -2
  530. helm/common/mongo_key_value_store.py +3 -3
  531. helm/common/multimodal_request_utils.py +26 -0
  532. helm/common/reeval_parameters.py +12 -0
  533. helm/common/request.py +6 -2
  534. helm/common/response_format.py +18 -0
  535. helm/common/test_cache.py +1 -48
  536. helm/common/test_media_object.py +1 -1
  537. helm/common/tokenization_request.py +0 -9
  538. helm/config/model_deployments.yaml +1258 -33
  539. helm/config/model_metadata.yaml +1110 -41
  540. helm/config/tokenizer_configs.yaml +403 -3
  541. helm/proxy/cli.py +2 -2
  542. helm/proxy/example_queries.py +1 -1
  543. helm/proxy/server.py +11 -13
  544. helm/proxy/services/remote_service.py +1 -7
  545. helm/proxy/services/server_service.py +6 -19
  546. helm/proxy/services/service.py +0 -6
  547. helm/proxy/services/test_remote_service.py +2 -2
  548. helm/proxy/services/test_service.py +1 -1
  549. helm/proxy/static/general.js +122 -0
  550. helm/proxy/static/help.html +99 -0
  551. helm/proxy/static/index.css +57 -0
  552. helm/proxy/static/index.html +40 -0
  553. helm/proxy/static/index.js +456 -0
  554. helm/proxy/static/info-icon.png +0 -0
  555. helm/proxy/test_retry.py +1 -1
  556. helm/proxy/token_counters/auto_token_counter.py +1 -1
  557. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  558. helm/tokenizers/caching_tokenizer.py +2 -30
  559. helm/tokenizers/http_model_tokenizer.py +1 -1
  560. helm/tokenizers/huggingface_tokenizer.py +2 -2
  561. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  562. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  563. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  564. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  565. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  566. helm/tokenizers/tokenizer.py +3 -1
  567. helm/tokenizers/yalm_tokenizer.py +3 -3
  568. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  569. crfm_helm-0.5.3.dist-info/METADATA +0 -355
  570. crfm_helm-0.5.3.dist-info/RECORD +0 -699
  571. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  572. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  573. helm/benchmark/data_overlap/light_scenario.py +0 -60
  574. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  575. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  576. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  577. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  578. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  579. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  580. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  581. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  582. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  583. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  584. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  585. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  586. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  587. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  588. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  589. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  590. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  591. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  592. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  593. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  594. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  595. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  596. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  597. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  598. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  599. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  600. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  601. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  602. helm/tokenizers/anthropic_tokenizer.py +0 -52
  603. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  604. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  605. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
  606. /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

helm/benchmark/scenarios/ice_scenario.py
@@ -5,8 +5,8 @@ from enum import Enum
 import pandas as pd
 
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 try:
     # pd.read_excel() uses xlrd
@@ -114,8 +114,12 @@ class ICEScenario(Scenario):
     """
     The International Corpus of English (ICE).
 
-    NOTE: This text cannot be downloaded
-    automatically. You must extract each subset zip file into /benchmark_output/scenarios/ice.
+    NOTE: This text cannot be downloaded automatically.
+    You must extract each subset zip file into args.output_path + '/scenarios/ice',
+    which is by default '/benchmark_output/scenarios/ice',
+    where args.output_path is parsed from the command line argument.
+    See helm.benchmark.runner for more details about args.output_path.
+
     The archives should extract into folders named according to the dictionary SUBSET_TO_DIRECTORY
     below.
 
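The revised docstring above describes a manual setup step. A minimal sketch of that step, assuming the default output path; the archive file name below is hypothetical and stands in for whichever ICE subset archives you have obtained:

import os
import zipfile

# Extract each manually obtained ICE subset archive into the directory the
# scenario reads from (default output_path is "benchmark_output").
target_dir = os.path.join("benchmark_output", "scenarios", "ice")
os.makedirs(target_dir, exist_ok=True)
for archive in ["ICE-GB.zip"]:  # hypothetical archive name
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(target_dir)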

helm/benchmark/scenarios/ifeval_scenario.py
@@ -0,0 +1,53 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class IFEvalScenario(Scenario):
+    """IFEval
+
+    IFEval contains around 500 "verifiable instructions" such as "write in more than 400 words"
+    and "mention the keyword of AI at least 3 times" which can be verified by heuristics."""
+
+    name = "ifeval"
+    description = "Instruction-Following Evaluation for Large Language Models"
+    tags = ["instruction following"]
+
+    def __init__(self):
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get IFEval from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "google/IFEval",
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="train",
+            revision="966cd89545d6b6acfd7638bc708b98261ca58e84",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for _, row in enumerate(dataset):
+            id = row["key"]
+            input = Input(text=row["prompt"].strip())
+            instance = Instance(
+                id=f"id{id}",
+                input=input,
+                references=[],
+                split=TEST_SPLIT,
+                extra_data={"instruction_ids": row["instruction_id_list"], "instruction_kwargs": row["kwargs"]},
+            )
+            instances.append(instance)
+
+        return instances
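
To see what the new scenario produces, a minimal sketch that exercises IFEvalScenario directly, assuming crfm-helm 0.5.5 is installed and HuggingFace is reachable; the output path is arbitrary:

from helm.benchmark.scenarios.ifeval_scenario import IFEvalScenario

# Materialize the ~500 verifiable-instruction prompts described in the docstring.
scenario = IFEvalScenario()
instances = scenario.get_instances(output_path="benchmark_output/scenarios/ifeval")
print(len(instances))
print(instances[0].extra_data["instruction_ids"])  # constraints checked by heuristics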

helm/benchmark/scenarios/imdb_ptbr_scenario.py
@@ -0,0 +1,60 @@
+from typing import Any, List, Dict
+from pathlib import Path
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class IMDB_PTBRScenario(Scenario):
+    """
+    The IMDB dataset is a widely-used benchmark dataset for natural language processing (NLP)
+    particularly for text classification and sentiment analysis.
+    This is a translated version that is meant to evaluate PT-BR models.
+    It consists of movie reviews from the Internet Movie Database (IMDB) and
+    includes both positive and negative sentiments labeled for supervised learning.
+    """
+
+    name = "simple_classification"
+    description = "Classify movie reviews between positive or negative."
+    tags = ["classification"]
+
+    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        label_names = {0: "negativo", 1: "positivo"}
+        for example in dataset[split]:
+            input = Input(text=example["text"])
+            # NOTE: For classification scenarios, the reference outputs should be the same
+            # for all instances, and should include both correct and incorrect classes.
+            # HELM only supports single-label classification. Exactly one reference
+            # should have the CORRECT_TAG tag.
+            references = [
+                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
+            ]
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+        dataset = load_dataset("maritaca-ai/imdb_pt", cache_dir=cache_dir)
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            if split not in splits.keys():
+                hlog(f"{split} split doesn't exist, skipping")
+                continue
+            instances.extend(self.process_dataset(dataset, splits[split]))
+
+        return instances

helm/benchmark/scenarios/imdb_scenario.py
@@ -2,8 +2,17 @@ import os
 from typing import List, Dict, Optional
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, Input, Output
-from .imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
+from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
 
 class IMDBScenario(Scenario):

helm/benchmark/scenarios/infinite_bench_sum_scenario.py
@@ -0,0 +1,82 @@
+import os
+import re
+from typing import List
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchSumScenario(Scenario):
+    """InfiniteBench Sum
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
+    InfiniteBench that requires models to generate a concise summary of the novel. The subset is referred
+    to as "En.Sum" in the original paper.
+    """
+
+    name = "infinite_bench_sum"
+    description = "Summarize a novel from InfiniteBench"
+    tags = ["summarization"]
+
+    def __init__(self, min_num_words: int, max_num_words: int):
+        self.min_num_words = min_num_words
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_sum_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.map(
+            lambda example: {"prompt_wc": count_words(example["context"]) + count_words(example["input"])}
+        ).filter(lambda example: self.min_num_words <= example["prompt_wc"] <= self.max_num_words)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            instance = Instance(
+                id=id,
+                input=input,
+                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+                extra_data={"word_count": row["prompt_wc"]},
+            )
+            instances.append(instance)
+
+        return instances
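
The filter above hinges on a whitespace word count over context plus input. A small self-contained sketch of the same rule, with hypothetical bounds, for sanity-checking which prompts survive:

import re

def count_words(text: str) -> int:
    # Same rule as the scenario: strip, then split on runs of whitespace.
    return len(re.split(r"\s+", text.strip()))

assert count_words("  It was a dark\nand stormy night.  ") == 7
min_num_words, max_num_words = 0, 100_000  # example constructor arguments
prompt_wc = count_words("some context") + count_words("some question")
assert min_num_words <= prompt_wc <= max_num_words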

helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
@@ -2,8 +2,8 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
-from .mmlu_scenario import MMLUScenario
+from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
+from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario
 
 
 class InteractiveQAMMLUScenario(MMLUScenario):

helm/benchmark/scenarios/koala_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
 
 
 class KoalaScenario(Scenario):

helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -0,0 +1,129 @@
+import os
+import pandas as pd
+import json
+import re
+
+from typing import List
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Output,
+)
+
+
+class LegalContractSummarizationScenario(Scenario):
+    """Legal Contract Summarization
+
+    A legal contract summarization benchmark based on the paper
+    Plain English Summarization of Contracts (Manor & Li, NAACL 2019),
+    which presented a dataset of legal text snippets paired with summaries
+    written in plain English.
+
+    @inproceedings{manor-li-2019-plain,
+        title = "Plain {E}nglish Summarization of Contracts",
+        author = "Manor, Laura and
+          Li, Junyi Jessy",
+        editor = "Aletras, Nikolaos and
+          Ash, Elliott and
+          Barrett, Leslie and
+          Chen, Daniel and
+          Meyers, Adam and
+          Preotiuc-Pietro, Daniel and
+          Rosenberg, David and
+          Stent, Amanda",
+        booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2019",
+        month = jun,
+        year = "2019",
+        address = "Minneapolis, Minnesota",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/W19-2201",
+        doi = "10.18653/v1/W19-2201",
+        pages = "1--11",
+        abstract = "Unilateral legal contracts, such as terms of service, play a substantial role in modern digital life. However, few read these documents before accepting the terms within, as they are too long and the language too complicated. We propose the task of summarizing such legal documents in plain English, which would enable users to have a better understanding of the terms they are accepting. We propose an initial dataset of legal text snippets paired with summaries written in plain English. We verify the quality of these summaries manually, and show that they involve heavy abstraction, compression, and simplification. Initial experiments show that unsupervised extractive summarization methods do not perform well on this task due to the level of abstraction and style differences. We conclude with a call for resource and technique development for simplification and style transfer for legal language.",
+    }
+    """ # noqa: E501
+
+    TRAIN_RATIO: float = 0.2
+    ARTICLE_COLUMN_NAME = "original_text"
+    SUMMARY_COLUMN_NAME = "reference_summary"
+    ID_COLUMN_NAME = "uid"
+
+    name = "legal_contract_summarization"
+    description = (
+        "Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf)."
+    )
+    tags = ["summarization", "legal"]
+
+    def __init__(self):
+        """
+        Initializes the scenario.
+
+        """
+        super().__init__()
+
+    @staticmethod
+    def _clean(text: str) -> str:
+        return re.sub(r"\s+", " ", text)
+
+    def _load_dataset(self, output_path: str):
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        source_url = "https://raw.githubusercontent.com/lauramanor/legal_summarization/master/all_v1.json"
+        source_file = os.path.basename(source_url)
+        target_path = os.path.join(data_dir, source_file)
+        ensure_file_downloaded(
+            source_url=source_url,
+            target_path=target_path,
+        )
+
+        target_df = pd.DataFrame()
+        with open(target_path) as f:
+            json_data = json.load(f)
+            target_df = pd.DataFrame.from_records(list(json_data.values()))
+        target_df = target_df.dropna(
+            subset=[
+                LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME,
+                LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME,
+                LegalContractSummarizationScenario.ID_COLUMN_NAME,
+            ]
+        )
+        # Split randomly (works better than split by order)
+        train_df = target_df.sample(frac=LegalContractSummarizationScenario.TRAIN_RATIO, random_state=0)
+        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+
+        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = self._load_dataset(output_path)
+
+        instances: List[Instance] = []
+
+        for split, split_data in dataset.items():
+            for example in split_data.itertuples():
+                id = getattr(example, LegalContractSummarizationScenario.ID_COLUMN_NAME)
+                article = LegalContractSummarizationScenario._clean(
+                    getattr(example, LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME)
+                )
+                summary = LegalContractSummarizationScenario._clean(
+                    getattr(example, LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME)
+                )
+                input = Input(
+                    text=article,
+                )
+                output = Output(text=summary)
+                instance = Instance(
+                    id=id,
+                    input=input,
+                    references=[Reference(output=output, tags=[CORRECT_TAG])],
+                    split=split,
+                )
+                instances.append(instance)
+
+        return instances
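
The _load_dataset split above is easy to misread: 20% of rows are sampled into train, and the remainder is shuffled into test. A minimal sketch with a toy DataFrame (column contents hypothetical) confirming the behavior:

import pandas as pd

df = pd.DataFrame({"uid": range(10), "original_text": ["t"] * 10, "reference_summary": ["s"] * 10})
train_df = df.sample(frac=0.2, random_state=0)  # TRAIN_RATIO = 0.2
test_df = df.drop(train_df.index).sample(frac=1, random_state=0)  # shuffled remainder
assert len(train_df) == 2 and len(test_df) == 8
assert set(train_df.index).isdisjoint(set(test_df.index))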

helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -0,0 +1,77 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+
+
+class LegalOpinionSentimentClassificationScenario(Scenario):
+    """
+    A legal opinion sentiment classification task based on the paper
+    Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
+    [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+
+    Example prompt:
+    Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
+    {Sentence}
+    Label: {positive/neutral/negative}
+
+    """
+
+    # Names of the tasks we support
+
+    name = "legal_opinion"
+    description = "Predicting the sentiment of the legal text in the positive, negative, or neutral."
+    tags = ["classification", "sentiment analysis", "legal"]
+
+    SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
+    SPLIT_TO_URL = {
+        TRAIN_SPLIT: "https://osf.io/download/hfn62/",
+        TEST_SPLIT: "https://osf.io/download/q4adh/",
+    }
+
+    def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        assert split in [TRAIN_SPLIT, TEST_SPLIT]
+        if split == TRAIN_SPLIT:
+            phrase_column_name = "Phrase"
+            label_column_name = "Label"
+        else:
+            phrase_column_name = "sentence"
+            label_column_name = "label"
+        for row in df.itertuples():
+            phrase = getattr(row, phrase_column_name)
+            label_index = int(getattr(row, label_column_name))
+            label = LegalOpinionSentimentClassificationScenario.SENTIMENT_CLASSES[label_index]
+            instance = Instance(
+                input=Input(text=phrase), references=[Reference(Output(text=label), tags=[CORRECT_TAG])], split=split
+            )
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.data_dir = os.path.join(output_path, "data")
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        instances: List[Instance] = []
+        for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items():
+            file_name = f"{split.lower()}.xlsx"
+            file_path = os.path.join(data_dir, file_name)
+            ensure_file_downloaded(
+                source_url=url,
+                target_path=os.path.join(data_dir, file_name),
+            )
+            df = pd.read_excel(file_path)
+            instances.extend(self.create_instances(df, split))
+        return instances

helm/benchmark/scenarios/legal_summarization_scenario.py
@@ -5,7 +5,17 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 _ALL_LANGUAGES = {
     "bulgarian": "bg",

helm/benchmark/scenarios/legal_support_scenario.py
@@ -3,7 +3,17 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class LegalSupportScenario(Scenario):

helm/benchmark/scenarios/legalbench_scenario.py
@@ -6,7 +6,16 @@ from pathlib import Path
 from typing import List, Dict
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, TEST_SPLIT, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
 
@@ -97,10 +106,20 @@ class LegalBenchScenario(Scenario):
         # Download data from Huggingface. LegalBench provides splits for samples to
         # be used for prompt construction and for testing.
         train_dataset = datasets.load_dataset(
-            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="train"
+            "nguha/legalbench",
+            self.subset,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="train",
+            revision="e042ea68c19df12b737fe768572f22ead61e8e37",
         )
         test_dataset = datasets.load_dataset(
-            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="test"
+            "nguha/legalbench",
+            self.subset,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="test",
+            revision="e042ea68c19df12b737fe768572f22ead61e8e37",
         )
         assert isinstance(train_dataset, datasets.Dataset)
         assert isinstance(test_dataset, datasets.Dataset)
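
This release pins HuggingFace dataset revisions to commit hashes throughout (the same pattern appears in the math scenario hunk below). A minimal sketch of the pattern, with the dataset name and revision copied from the hunk above and a subset name chosen only for illustration:

import datasets

# Pinning `revision` keeps runs reproducible even if the dataset repo changes later.
train = datasets.load_dataset(
    "nguha/legalbench",
    "abercrombie",  # illustrative subset; the scenario passes self.subset
    trust_remote_code=True,
    split="train",
    revision="e042ea68c19df12b737fe768572f22ead61e8e37",
)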

helm/benchmark/scenarios/lex_glue_scenario.py
@@ -5,8 +5,18 @@ from typing import List, Any
 import datasets
 from datasets import load_dataset
 
-from .lextreme_scenario import TaskType
-from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Input, Output
+from helm.benchmark.scenarios.lextreme_scenario import TaskType
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 ECTHR_A = "ecthr_a"
 ECTHR_B = "ecthr_b"

helm/benchmark/scenarios/lextreme_scenario.py
@@ -6,7 +6,17 @@ from typing import List, Any
 import datasets
 from datasets import load_dataset
 
-from .scenario import Scenario, Instance, Reference, CORRECT_TAG, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, Output, Input
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Output,
+    Input,
+)
 
 
 class TaskType:

helm/benchmark/scenarios/live_qa_scenario.py
@@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
 import xml.etree.ElementTree as ET
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
 
 
 class LiveQAScenario(Scenario):

helm/benchmark/scenarios/lm_entry_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
 
 
 class LMEntryScenario(Scenario):

helm/benchmark/scenarios/lsat_qa_scenario.py
@@ -3,7 +3,7 @@ import json
 from typing import Dict, List
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import (
+from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     Reference,

helm/benchmark/scenarios/math_scenario.py
@@ -368,7 +368,15 @@ class MATHScenario(Scenario):
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         data = (
-            typing.cast(DatasetDict, load_dataset("competition_math", trust_remote_code=True, cache_dir=cache_dir))
+            typing.cast(
+                DatasetDict,
+                load_dataset(
+                    "hendrycks/competition_math",
+                    trust_remote_code=True,
+                    cache_dir=cache_dir,
+                    revision="71b758ecc688b2822d07ffa7f8393299f1dc7cac",
+                ),
+            )
             .sort("problem")
             .shuffle(seed=42)
         )

helm/benchmark/scenarios/me_q_sum_scenario.py
@@ -2,7 +2,16 @@ import os
 from typing import List
 
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from .scenario import Scenario, Instance, Reference, ALL_SPLITS, CORRECT_TAG, VALID_SPLIT, Input, Output
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
 
 
 class MeQSumScenario(Scenario):