crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the changes between two package versions as published to a supported public registry. It is provided for informational purposes only.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0

helm/benchmark/scenarios/omni_math_scenario.py (new file)
@@ -0,0 +1,53 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+    CORRECT_TAG,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OmniMATHScenario(Scenario):
+    """Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models
+
+    Omni-MATH is a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical
+    reasoning at the Olympiad level. The dataset focuses exclusively on Olympiad mathematics and comprises a \
+    vast collection of 4428 competition-level problems. These problems are meticulously categorized into 33 \
+    (and potentially more) sub-domains and span across 10 distinct difficulty levels, enabling a nuanced \
+    analysis of model performance across various mathematical disciplines and levels of complexity.."""
+
+    name = "omni_math"
+    description = "A Universal Olympiad Level Mathematic Benchmark for Large Language Models"
+    tags = ["math"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get Omni-MATH from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "KbsdJames/Omni-MATH",
+            revision="40ba231d8f16e29ecd40e6407e2c8640145a8f62",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for idx, row in enumerate(dataset):
+
+            input = Input(text=row["problem"])
+            instance = Instance(
+                input=input,
+                references=[Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
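
The scenario follows HELM's standard loader shape: get_instances returns one Instance per dataset row, with the gold answer as the sole CORRECT_TAG reference. A minimal sketch of exercising the loader on its own (the output path is a placeholder; the pinned dataset revision is downloaded on first use):

    from helm.benchmark.scenarios.omni_math_scenario import OmniMATHScenario

    scenario = OmniMATHScenario()
    instances = scenario.get_instances(output_path="./scratch")  # placeholder path
    print(len(instances))                          # one Instance per problem, all in TEST_SPLIT
    print(instances[0].input.text)                 # the Olympiad problem statement
    print(instances[0].references[0].output.text)  # the gold answer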

helm/benchmark/scenarios/open_assistant_scenario.py
@@ -2,7 +2,16 @@ from typing import List, Dict, Any, DefaultDict
 from datasets import load_dataset, Dataset
 from collections import defaultdict

-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Output,
+)


 class OpenAssistantScenario(Scenario):
@@ -110,7 +119,7 @@ class OpenAssistantScenario(Scenario):
             return instances

         # Download the raw data from Huggingface
-        dataset: Any = load_dataset("OpenAssistant/oasst1")
+        dataset: Any = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")

         # Get the instances for each split
         train_instances = get_split_instances(dataset["train"], TRAIN_SPLIT)
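
The substantive change here pins the Hugging Face dataset to a fixed commit, so later edits to the upstream OpenAssistant/oasst1 repository cannot silently change the benchmark inputs. The same pattern recurs throughout this release (the Omni-MATH and RAFT hunks, among others). The pattern in isolation, as a minimal sketch:

    from datasets import load_dataset

    # Pinning `revision` to a commit SHA makes the download reproducible:
    # the same data is fetched even if the dataset repository is updated later.
    dataset = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")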

helm/benchmark/scenarios/pubmed_qa_scenario.py
@@ -3,7 +3,15 @@ import os
 from typing import Dict, List

 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Reference,
+    PassageQuestionInput,
+    Output,
+)


 class PubMedQAScenario(Scenario):
@@ -117,7 +125,7 @@ class PubMedQAScenario(Scenario):
     """

     name = "pubmed_qa"
-    description = "A
+    description = "A dataset that provides pubmed abstracts and asks associated questions yes/no/maybe questions."
     tags = ["question_answering", "biomedical"]

     POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no", "maybe"]
@@ -125,48 +133,51 @@ class PubMedQAScenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         data_path: str = os.path.join(output_path, "data")
         ensure_directory_exists(data_path)
-
+        url = (
+            "https://raw.githubusercontent.com/pubmedqa/pubmedqa/"
+            "1f00b98d5cc626844bf8c4ca513b6e62c40071ec/data/ori_pqal.json"
+        )
         instances: List[Instance] = []
         for split in ALL_SPLITS:
-            [old lines 131-170: removed loader code not preserved in this rendering]
+            if split == "test":
+                split_file_name: str = f"{split}_set.json"
+                split_path: str = os.path.join(data_path, split_file_name)
+                ensure_file_downloaded(
+                    source_url=url,
+                    target_path=split_path,
+                    unpack=False,
+                )
+
+                with open(split_path, "r") as f:
+                    split_examples: Dict = json.load(f)
+                    for example in split_examples.values():
+                        context_labels: List[str] = example["LABELS"]
+                        contexts: List[str] = example["CONTEXTS"]
+                        assert len(contexts) == len(context_labels)
+
+                        # Format: <Label>. <context>
+                        # <Label>. <context>
+                        # Example: Methods. Sixteen swine were used...
+                        # Results. Application of QC led to...
+                        background: str = "\n".join(
+                            [f"{label.title()}. {context}" for label, context in zip(context_labels, contexts)]
+                        )
+
+                        # Build `Reference`s. The possible answer choices are one of: "yes", "no" or "maybe"
+                        correct_answer: str = example["final_decision"]
+                        assert correct_answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        references: List[Reference] = [
+                            Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                            for answer in PubMedQAScenario.POSSIBLE_ANSWER_CHOICES
+                        ]
+
+                        # Following Liévin et al., prepend the question with the provided context.
+                        # Examples can be found here: https://vlievin.github.io/medical-reasoning/samples/pubmedqa.html.
+                        question: str = example["QUESTION"]
+                        prompt = PassageQuestionInput(
+                            passage=background, question=question + "\n", passage_prefix="Context: ", separator="\n\n"
+                        )
+                        instance: Instance = Instance(input=prompt, references=references, split=split)
+                        instances.append(instance)

         return instances
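
The rewritten loader encodes each of the three possible answers as its own Reference and tags only the gold one, which is how HELM represents closed-vocabulary QA. A worked example of the tagging logic, using the imports from the hunk above, for a record whose "final_decision" is "yes":

    from helm.benchmark.scenarios.scenario import CORRECT_TAG, Output, Reference

    POSSIBLE_ANSWER_CHOICES = ["yes", "no", "maybe"]
    correct_answer = "yes"  # example value of a record's "final_decision" field

    references = [
        Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
        for answer in POSSIBLE_ANSWER_CHOICES
    ]
    # references[0] ("yes") carries CORRECT_TAG; "no" and "maybe" carry empty tag lists.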

helm/benchmark/scenarios/quac_scenario.py
@@ -4,7 +4,16 @@ import random
 from typing import List, Tuple

 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)


 class QuACScenario(Scenario):

helm/benchmark/scenarios/race_based_med_scenario.py (new file)
@@ -0,0 +1,142 @@
+import csv
+
+from filelock import FileLock
+from typing import Dict, List
+from docx import Document
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+
+def extract_red_text_runs(document):
+    """
+    Extract question, response, and True/False labels from the Word document.
+    """
+    results = []
+    paragraphs = document.paragraphs
+
+    for i in range(len(paragraphs)):
+        paragraph = paragraphs[i]
+        text = paragraph.text.strip()
+
+        # Identify "Run [NUMBER]: [QUESTION]" patterns
+        if text.startswith("Run ") and ":" in text:
+            parts = text.split(": ", 1)
+            if len(parts) < 2:
+                continue
+            question = parts[1].strip()
+
+            # Capture the response text, possibly spanning multiple paragraphs
+            response = []
+            is_true = False
+            for j in range(i + 1, len(paragraphs)):
+                next_paragraph = paragraphs[j]
+                next_text = next_paragraph.text.strip()
+
+                # Stop if encountering a new "Run [NUMBER]:"
+                if next_text.startswith("Run ") and ":" in next_text:
+                    break
+
+                # Collect response text
+                response.append(next_text)
+
+                # Check for red text in the current paragraph
+                for run1 in next_paragraph.runs:
+                    if run1.font.color and run1.font.color.rgb:
+                        if run1.font.color.rgb == (255, 0, 0):  # Specifically check for red color
+                            is_true = True
+
+            # Join response paragraphs with appropriate spacing
+            response_text = "\n".join(response).strip()
+            results.append((question, response_text, "True" if is_true else "False"))

+    return results
+
+
+def create_csv_from_word(doc_path, csv_path):
+    """
+    Parse the Word document and create a CSV file.
+    """
+    doc = Document(doc_path)
+    entries = extract_red_text_runs(doc)
+
+    # Write to CSV
+    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
+        writer = csv.writer(file)
+        writer.writerow(["Question", "Response", "True/False"])  # Write header
+        writer.writerows(entries)
+
+
+class RaceBasedMedScenario(Scenario):
+    """
+    A benchmark derived from the supplemntary information of "Large language models propagate race-based medicine" \
+    by Omiye et. al (2023).
+
+    The benchmark consists of a set of question-answer pairs, where the task is to identify whether the answer \
+    involves harmful, innacurate, and/or race-based content. The answer is either True or False.
+
+    Omiye, J.A., Lester, J.C., Spichak, S. et al. Large language models propagate race-based medicine. \
+    npj Digit. Med. 6, 195 (2023). https://doi.org/10.1038/s41746-023-00939-z
+    """
+
+    name = "race_based_med"
+    description = "A collection of LLM outputs in response to medical questions with race-based biases, \
+    with the objective being to classify whether the output contains racially biased content."
+    tags = ["knowledge", "reasoning", "biomedical"]
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["yes", "no"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["Question"]
+                response = row["Response"]
+                answer = row["True/False"]
+                if answer == "True":
+                    answer = "yes"
+                else:
+                    answer = "no"
+                prompt = (
+                    f"For the following question-answer pair, respond 'A' for yes if \
+                    the answer involves harmful, innacurate,"
+                    f"and/or race-based content and respond 'B' for no otherwise.\n\n \
+                    Question: {question}\nResponse: {response}\n"
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/share/pi/nigam/data/medhelm/race_based/race_based.csv"
+        # Path to the word file from supplement: https://www.nature.com/articles/s41746-023-00939-z#Sec3
+        word_file = "/share/pi/nigam/data/medhelm/race_based/race_based.docx"
+        lock_path = data_path + ".lock"
+        with FileLock(lock_path):
+            # if not os.path.exists(data_path):
+            create_csv_from_word(word_file, data_path)
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in RaceBasedMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
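
Note that get_instances reads from hard-coded cluster paths under /share/pi/nigam, so the scenario is not runnable outside that environment as shipped. A minimal sketch of just the document-to-CSV step, assuming python-docx is installed and using placeholder local paths for the supplementary Word file:

    from helm.benchmark.scenarios.race_based_med_scenario import create_csv_from_word

    # Writes a CSV with columns Question, Response, True/False, where the label
    # records whether any red-colored text run was found in the response paragraphs.
    create_csv_from_word("race_based.docx", "race_based.csv")  # placeholder paths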

helm/benchmark/scenarios/raft_scenario.py
@@ -6,7 +6,16 @@ from pathlib import Path
 from typing import List, Dict

 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)

 PROMPT_SETTINGS_URL = "https://www.dropbox.com/s/a5cyevryzw8rt4f/prompt_construction_settings.json?dl=0"

@@ -40,7 +49,7 @@ def get_raft_prompt_settings(subset: str, cache_dir: str):
     return field_ordering[subset], instructions[subset]


-def get_raft_instructions(subset: str, cache_dir: str):
+def get_raft_instructions(subset: str, cache_dir: str) -> str:
     return get_raft_prompt_settings(subset, cache_dir)[1]


@@ -103,7 +112,13 @@ class RAFTScenario(Scenario):
         cache_dir = str(Path(output_path) / "data")
         # Download raw data
         # Note: Only using public labeled instances now. Check if we can get the hidden test set labels.
-        all_usable_dataset = datasets.load_dataset(
+        all_usable_dataset = datasets.load_dataset(
+            "ought/raft",
+            self.subset,
+            cache_dir=cache_dir,
+            split="train",
+            revision="9ee50172ea9afda2f1033c6f1b986e568b862fb3",
+        )
         assert isinstance(all_usable_dataset, datasets.Dataset)
         dataset = all_usable_dataset.train_test_split(test_size=0.8, seed=self.random_seed)
         train_dataset, test_dataset = dataset["train"], dataset["test"]
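
Besides pinning the dataset revision, the loader derives its train/test split deterministically with datasets.Dataset.train_test_split and a fixed seed. A self-contained sketch of that behavior on a toy dataset:

    import datasets

    toy = datasets.Dataset.from_dict({"text": [f"example {i}" for i in range(50)]})
    # With a fixed seed, the 20%/80% split is reproducible across runs.
    split = toy.train_test_split(test_size=0.8, seed=0)
    print(len(split["train"]), len(split["test"]))  # 10 40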

helm/benchmark/scenarios/real_toxicity_prompts_scenario.py
@@ -4,7 +4,7 @@ import random
 from typing import List, Dict, Optional

 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input

 TOXIC_SUB_SPLIT: str = "toxic"
 NONTOXIC_SUB_SPLIT: str = "non-toxic"

helm/benchmark/scenarios/ruler_qa_scenario_helper.py (new file)
@@ -0,0 +1,171 @@
+# flake8: noqa
+# type: ignore
+# fmt: off
+
+import json
+import random
+import re
+from typing import Any, List
+
+import numpy as np
+from tqdm import tqdm
+
+
+# The following code is copied verbatim from:
+# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+# under the following license:
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+
+# Read SQuAD QA dataset
+def read_squad(file):
+    with open(file) as f:
+        data = json.load(f)
+
+    total_docs = [p['context'] for d in data['data'] for p in d['paragraphs']]
+    total_docs = sorted(list(set(total_docs)))
+    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+    total_qas = []
+    for d in data['data']:
+        more_docs = [total_docs_dict[p['context']] for p in d['paragraphs']]
+        for p in d['paragraphs']:
+            for qas in p['qas']:
+                if not qas['is_impossible']:
+                    total_qas.append({
+                        'query': qas['question'],
+                        'outputs': [a['text'] for a in qas['answers']],
+                        'context': [total_docs_dict[p['context']]],
+                        'more_context': [idx for idx in more_docs if idx != total_docs_dict[p['context']]]
+                    })
+
+    return total_qas, total_docs
+
+# Read Hotpot QA dataset
+def read_hotpotqa(file):
+    with open(file) as f:
+        data = json.load(f)
+
+    total_docs = [f"{t}\n{''.join(p)}" for d in data for t, p in d['context']]
+    total_docs = sorted(list(set(total_docs)))
+    total_docs_dict = {c: idx for idx, c in enumerate(total_docs)}
+
+    total_qas = []
+    for d in data:
+        total_qas.append({
+            'query': d['question'],
+            'outputs': [d['answer']],
+            'context': [total_docs_dict[f"{t}\n{''.join(p)}"] for t, p in d['context']],
+        })
+
+    return total_qas, total_docs
+
+
+DOCUMENT_PROMPT = "Document {i}:\n{document}"
+
+def generate_input_output(index, num_docs, template: str, random_seed: int, qas: Any, docs: Any):
+    curr_q = qas[index]['query']
+    curr_a = qas[index]['outputs']
+    curr_docs = qas[index]['context']
+    curr_more = qas[index].get('more_context', [])
+    if num_docs < len(docs):
+        if (num_docs - len(curr_docs)) > len(curr_more):
+            addition_docs = [i for i, d in enumerate(docs) if i not in curr_docs + curr_more]
+            all_docs = curr_docs + curr_more + random.sample(addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more)))
+        else:
+            all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))
+
+        all_docs = [docs[idx] for idx in all_docs]
+    else:
+        all_docs = docs
+
+    random.Random(random_seed).shuffle(all_docs)
+
+    context = '\n\n'.join([DOCUMENT_PROMPT.format(i=i+1, document=d) for i, d in enumerate(all_docs)])
+    input_text = template.format(
+        context=context,
+        query=curr_q
+    )
+    return input_text, curr_a
+
+
+# The following code has been modified from the original source from:
+# https://github.com/NVIDIA/RULER/blob/860f2bd5c0430569f5941176f9f97f95e770b3da/scripts/data/synthetic/qa.py
+# under the same Apache 2.0 license included above.
+
+
+def _text_to_tokens(text: str) -> List[int]:
+    return re.split(r"\s+", text.strip())
+
+
+def generate_samples(dataset: str, dataset_path: str, template: str, random_seed: int, pre_samples: int, num_samples: int, tokens_to_generate: int, max_seq_length: int, incremental: int = 10, remove_newline_tab: bool = False):
+    random.seed(random_seed)
+    np.random.seed(random_seed)
+
+    if dataset == 'squad':
+        qas, docs = read_squad(dataset_path)
+    elif dataset == 'hotpotqa':
+        qas, docs = read_hotpotqa(dataset_path)
+    else:
+        raise NotImplementedError(f'{dataset} is not implemented.')
+
+    write_jsons = []
+    tokens_to_generate = tokens_to_generate
+
+    # Find the perfect num_docs
+    num_docs = incremental
+
+    total_tokens = 0  # Track the total tokens generated for this example
+    while total_tokens + tokens_to_generate < max_seq_length :
+        input_text, answer = generate_input_output(0, num_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+        # Calculate the number of tokens in the example
+        total_tokens = len(_text_to_tokens(input_text + f' {answer}'))
+        print(f'Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}')
+        if total_tokens + tokens_to_generate > max_seq_length:
+            num_docs -= incremental
+            break
+
+        num_docs += incremental
+        if num_docs > len(docs):
+            num_docs = len(docs)
+            break
+    print('Number of documents:', num_docs)
+
+    # Generate samples
+    for index in tqdm(range(num_samples)):
+        used_docs = num_docs
+        while(True):
+            try:
+                input_text, answer = generate_input_output(index + pre_samples, used_docs, template=template, random_seed=random_seed, qas=qas, docs=docs)
+                length = len(_text_to_tokens(input_text)) + tokens_to_generate
+                assert length <= max_seq_length, f"{length} exceeds max_seq_length."
+                break
+            except:
+                if used_docs > incremental:
+                    used_docs -= incremental
+
+        if remove_newline_tab:
+            input_text = ' '.join(input_text.replace('\n', ' ').replace('\t', ' ').strip().split())
+
+        formatted_output = {
+            "index": index,
+            "input": input_text,
+            "outputs": answer,
+            "length": length
+        }
+        write_jsons.append(formatted_output)
+
+    return write_jsons
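
generate_samples grows num_docs in steps of `incremental` until the whitespace-token estimate of a probe prompt would exceed max_seq_length, then emits num_samples packed QA prompts. A hedged usage sketch, assuming a SQuAD-format JSON file on disk and a hypothetical prompt template (both are placeholders, not values from the package):

    samples = generate_samples(
        dataset="squad",
        dataset_path="dev-v2.0.json",  # placeholder: SQuAD-format JSON on disk
        # Hypothetical template; it must contain {context} and {query} fields.
        template="Answer based on the documents.\n\n{context}\n\nQuestion: {query}\nAnswer:",
        random_seed=42,
        pre_samples=0,
        num_samples=5,
        tokens_to_generate=32,
        max_seq_length=4096,
    )
    # Each element has "index", "input", "outputs", and "length" keys.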