crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of crfm-helm has been flagged as potentially problematic.

Files changed (580)
  1. crfm_helm-0.5.5.dist-info/METADATA +413 -0
  2. crfm_helm-0.5.5.dist-info/RECORD +894 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  22. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  23. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  24. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  25. helm/benchmark/adaptation/prompt.py +1 -1
  26. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  27. helm/benchmark/annotation/air_bench_annotator.py +20 -5
  28. helm/benchmark/annotation/annotator.py +5 -0
  29. helm/benchmark/annotation/annotator_factory.py +3 -20
  30. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  31. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  32. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  33. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  34. helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
  35. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  36. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  37. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  38. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  39. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  40. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  41. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  42. helm/benchmark/annotation/medalign_annotator.py +100 -0
  43. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  44. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  45. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  46. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  47. helm/benchmark/annotation/model_as_judge.py +218 -6
  48. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  49. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  50. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  51. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  52. helm/benchmark/annotation/omni_math_annotator.py +132 -0
  53. helm/benchmark/annotation/spider_annotator.py +18 -0
  54. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  55. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  56. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  57. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  58. helm/benchmark/annotation_executor.py +35 -15
  59. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  60. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  61. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  62. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  63. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  64. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  65. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  66. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  67. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  68. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  69. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  70. helm/benchmark/augmentations/perturbation.py +1 -1
  71. helm/benchmark/augmentations/space_perturbation.py +2 -2
  72. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  73. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  74. helm/benchmark/augmentations/test_perturbation.py +16 -13
  75. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  76. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  77. helm/benchmark/data_preprocessor.py +2 -2
  78. helm/benchmark/huggingface_registration.py +2 -7
  79. helm/benchmark/metrics/aci_bench_metrics.py +34 -0
  80. helm/benchmark/metrics/basic_metrics.py +6 -6
  81. helm/benchmark/metrics/bbq_metrics.py +2 -2
  82. helm/benchmark/metrics/bias_metrics.py +12 -3
  83. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  84. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  85. helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
  86. helm/benchmark/metrics/classification_metrics.py +76 -12
  87. helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
  88. helm/benchmark/metrics/code_metrics.py +5 -5
  89. helm/benchmark/metrics/comet_metric.py +125 -0
  90. helm/benchmark/metrics/common_metric_specs.py +9 -2
  91. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  92. helm/benchmark/metrics/copyright_metrics.py +4 -4
  93. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  94. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  95. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  96. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  97. helm/benchmark/metrics/dischargeme_metrics.py +34 -0
  98. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  99. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  100. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  101. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  102. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  103. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  104. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  105. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  106. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  107. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  108. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  109. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  110. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  111. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  112. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  113. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  114. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  115. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  116. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  117. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  118. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  120. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  121. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  122. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  123. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  124. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  125. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  126. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  127. helm/benchmark/metrics/med_dialog_metrics.py +34 -0
  128. helm/benchmark/metrics/medalign_metrics.py +34 -0
  129. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  130. helm/benchmark/metrics/medec_metrics.py +101 -0
  131. helm/benchmark/metrics/medi_qa_metrics.py +34 -0
  132. helm/benchmark/metrics/medication_qa_metrics.py +15 -4
  133. helm/benchmark/metrics/mental_health_metrics.py +34 -0
  134. helm/benchmark/metrics/metric.py +3 -3
  135. helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
  136. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  137. helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
  138. helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
  139. helm/benchmark/metrics/nltk_helper.py +32 -0
  140. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  141. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  142. helm/benchmark/metrics/output_processing_metric.py +60 -0
  143. helm/benchmark/metrics/output_processors.py +15 -0
  144. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  145. helm/benchmark/metrics/ranking_metrics.py +3 -3
  146. helm/benchmark/metrics/reference_metric.py +3 -3
  147. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  148. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  149. helm/benchmark/metrics/spider_metrics.py +7 -0
  150. helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
  151. helm/benchmark/metrics/statistic.py +1 -1
  152. helm/benchmark/metrics/summac/model_summac.py +1 -1
  153. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  154. helm/benchmark/metrics/summarization_metrics.py +19 -9
  155. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  156. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  157. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  158. helm/benchmark/metrics/test_metric.py +1 -1
  159. helm/benchmark/metrics/test_statistic.py +2 -2
  160. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  161. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  162. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  163. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  164. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  165. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  166. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  167. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
  168. helm/benchmark/metrics/toxicity_metrics.py +4 -4
  169. helm/benchmark/metrics/unitxt_metrics.py +4 -1
  170. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  171. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  172. helm/benchmark/model_metadata_registry.py +16 -0
  173. helm/benchmark/presentation/summarize.py +23 -10
  174. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  175. helm/benchmark/reeval_run.py +203 -0
  176. helm/benchmark/reeval_runner.py +355 -0
  177. helm/benchmark/run.py +8 -17
  178. helm/benchmark/run_expander.py +78 -8
  179. helm/benchmark/run_spec_factory.py +12 -0
  180. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  181. helm/benchmark/run_specs/audio_run_specs.py +613 -0
  182. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  183. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  184. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  185. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  186. helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
  187. helm/benchmark/run_specs/experimental_run_specs.py +112 -3
  188. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  189. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  190. helm/benchmark/run_specs/long_context_run_specs.py +89 -0
  191. helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
  192. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  193. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  194. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  195. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  196. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  197. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  198. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  199. helm/benchmark/run_specs/vlm_run_specs.py +75 -2
  200. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  201. helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
  202. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  203. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  204. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  205. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  206. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
  207. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  208. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  209. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  210. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  211. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  212. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  213. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  214. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  215. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  216. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  217. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  218. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  219. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  220. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  221. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  222. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  223. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  224. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  225. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  226. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
  227. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  228. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
  229. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  230. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  231. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  232. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  233. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  234. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  235. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  236. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  237. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  238. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  239. helm/benchmark/scenarios/bold_scenario.py +1 -1
  240. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  241. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  242. helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
  243. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  244. helm/benchmark/scenarios/clear_scenario.py +153 -0
  245. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  246. helm/benchmark/scenarios/code_scenario.py +17 -4
  247. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  248. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  249. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  250. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  251. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  252. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  253. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  254. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  255. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  256. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  257. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  258. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  259. helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
  260. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  261. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  262. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  263. helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
  264. helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
  265. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  266. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  267. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  268. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  269. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  270. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  271. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  272. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  273. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  274. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  275. helm/benchmark/scenarios/headqa_scenario.py +131 -0
  276. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  277. helm/benchmark/scenarios/ice_scenario.py +8 -4
  278. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  279. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  280. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  281. helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
  282. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  283. helm/benchmark/scenarios/koala_scenario.py +1 -1
  284. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  285. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  286. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  287. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  288. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  289. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  290. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  291. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  292. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  293. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  294. helm/benchmark/scenarios/math_scenario.py +9 -1
  295. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  296. helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
  297. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  298. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  299. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  300. helm/benchmark/scenarios/medalign_scenario.py +88 -0
  301. helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
  302. helm/benchmark/scenarios/medbullets_scenario.py +140 -0
  303. helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
  304. helm/benchmark/scenarios/medec_scenario.py +120 -0
  305. helm/benchmark/scenarios/medhallu_scenario.py +66 -0
  306. helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
  307. helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
  308. helm/benchmark/scenarios/mental_health_scenario.py +112 -0
  309. helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
  310. helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
  311. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
  312. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  313. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  314. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  315. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  316. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
  317. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
  318. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
  319. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  320. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  321. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  322. helm/benchmark/scenarios/numeracy_scenario.py +10 -1
  323. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  324. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  325. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  326. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
  328. helm/benchmark/scenarios/quac_scenario.py +10 -1
  329. helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
  330. helm/benchmark/scenarios/raft_scenario.py +17 -2
  331. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  332. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  333. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  334. helm/benchmark/scenarios/scenario.py +9 -1
  335. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  336. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  337. helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
  338. helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
  339. helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
  340. helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
  341. helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
  342. helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
  343. helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
  344. helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
  345. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  346. helm/benchmark/scenarios/spider_scenario.py +91 -0
  347. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
  348. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  349. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  350. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  351. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  352. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  353. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  354. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  355. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  356. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  357. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  358. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  359. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  360. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  361. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
  362. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  363. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  364. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  365. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  366. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  367. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  368. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  369. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  370. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  371. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  372. helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
  373. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  374. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  375. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  376. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  377. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  378. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  379. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  380. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  381. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  382. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  383. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  384. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  385. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  386. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  387. helm/benchmark/server.py +11 -0
  388. helm/benchmark/slurm_runner.py +1 -1
  389. helm/benchmark/static/schema_audio.yaml +752 -0
  390. helm/benchmark/static/schema_autobencher.yaml +150 -0
  391. helm/benchmark/static/schema_call_center.yaml +97 -60
  392. helm/benchmark/static/schema_capabilities.yaml +254 -0
  393. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  394. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  395. helm/benchmark/static/schema_enterprise.yaml +298 -0
  396. helm/benchmark/static/schema_finance.yaml +14 -12
  397. helm/benchmark/static/schema_heim.yaml +1389 -0
  398. helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
  399. helm/benchmark/static/schema_medhelm.yaml +1081 -0
  400. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  401. helm/benchmark/static/schema_safety.yaml +18 -1
  402. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  403. helm/benchmark/static/schema_social_audio.yaml +224 -0
  404. helm/benchmark/static/schema_sql.yaml +171 -0
  405. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  406. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  407. helm/benchmark/static/schema_vhelm.yaml +109 -36
  408. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  409. helm/benchmark/static_build/assets/index-262903c1.js +10 -0
  410. helm/benchmark/static_build/assets/index-42060d71.css +1 -0
  411. helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
  412. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  413. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  414. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
  415. helm/benchmark/static_build/config.js +1 -1
  416. helm/benchmark/static_build/index.html +5 -5
  417. helm/benchmark/window_services/default_window_service.py +1 -1
  418. helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
  419. helm/benchmark/window_services/ice_window_service.py +1 -1
  420. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  421. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  422. helm/benchmark/window_services/local_window_service.py +2 -2
  423. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  424. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  425. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  426. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  427. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  428. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  429. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  430. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  431. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  432. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  433. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  434. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  435. helm/benchmark/window_services/test_utils.py +1 -1
  436. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  437. helm/benchmark/window_services/yalm_window_service.py +1 -1
  438. helm/clients/ai21_client.py +3 -3
  439. helm/clients/aleph_alpha_client.py +1 -1
  440. helm/clients/audio_language/__init__.py +0 -0
  441. helm/clients/audio_language/diva_llama_client.py +118 -0
  442. helm/clients/audio_language/llama_omni_client.py +198 -0
  443. helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
  444. helm/clients/audio_language/qwen_audiolm_client.py +150 -0
  445. helm/clients/auto_client.py +4 -2
  446. helm/clients/azure_openai_client.py +55 -0
  447. helm/clients/bedrock_client.py +201 -7
  448. helm/clients/bedrock_utils.py +33 -0
  449. helm/clients/clip_scorers/clip_scorer.py +1 -1
  450. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  451. helm/clients/cohere_client.py +3 -3
  452. helm/clients/google_client.py +1 -1
  453. helm/clients/http_model_client.py +1 -1
  454. helm/clients/huggingface_client.py +10 -18
  455. helm/clients/ibm_client.py +267 -0
  456. helm/clients/image_generation/adobe_vision_client.py +1 -1
  457. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  458. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  459. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  460. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  461. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  462. helm/clients/image_generation/cogview2_client.py +1 -1
  463. helm/clients/image_generation/dalle2_client.py +1 -1
  464. helm/clients/image_generation/dalle3_client.py +2 -2
  465. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  466. helm/clients/image_generation/dalle_mini/data.py +1 -1
  467. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  468. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  469. helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
  470. helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
  471. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  472. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  473. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  474. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  475. helm/clients/image_generation/dalle_mini_client.py +1 -1
  476. helm/clients/image_generation/deep_floyd_client.py +1 -1
  477. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  478. helm/clients/image_generation/lexica_client.py +1 -1
  479. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  480. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  481. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  482. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  483. helm/clients/image_generation/mindalle_client.py +1 -1
  484. helm/clients/image_generation/together_image_generation_client.py +1 -1
  485. helm/clients/lit_gpt_client.py +2 -2
  486. helm/clients/mistral_client.py +62 -18
  487. helm/clients/nvidia_nim_client.py +0 -3
  488. helm/clients/openai_client.py +241 -22
  489. helm/clients/palmyra_client.py +1 -4
  490. helm/clients/reka_client.py +1 -1
  491. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  492. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  493. helm/clients/stanfordhealthcare_google_client.py +43 -0
  494. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  495. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  496. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  497. helm/clients/test_client.py +1 -1
  498. helm/clients/test_together_client.py +6 -1
  499. helm/clients/together_client.py +47 -7
  500. helm/clients/upstage_client.py +23 -0
  501. helm/clients/vertexai_client.py +39 -13
  502. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  503. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  504. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  505. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  506. helm/clients/vision_language/qwen2_vlm_client.py +175 -0
  507. helm/clients/vllm_client.py +4 -6
  508. helm/clients/yi_client.py +0 -3
  509. helm/common/audio_utils.py +111 -0
  510. helm/common/file_caches/local_file_cache.py +1 -1
  511. helm/common/file_caches/test_local_file_cache.py +1 -1
  512. helm/common/images_utils.py +2 -2
  513. helm/common/media_object.py +2 -2
  514. helm/common/multimodal_request_utils.py +26 -0
  515. helm/common/reeval_parameters.py +12 -0
  516. helm/common/request.py +6 -2
  517. helm/common/response_format.py +18 -0
  518. helm/common/test_media_object.py +1 -1
  519. helm/config/model_deployments.yaml +1112 -19
  520. helm/config/model_metadata.yaml +985 -44
  521. helm/config/tokenizer_configs.yaml +379 -3
  522. helm/proxy/cli.py +2 -2
  523. helm/proxy/example_queries.py +1 -1
  524. helm/proxy/server.py +11 -4
  525. helm/proxy/services/remote_service.py +1 -1
  526. helm/proxy/services/server_service.py +1 -1
  527. helm/proxy/services/test_remote_service.py +2 -2
  528. helm/proxy/services/test_service.py +1 -1
  529. helm/proxy/static/general.js +122 -0
  530. helm/proxy/static/help.html +99 -0
  531. helm/proxy/static/index.css +57 -0
  532. helm/proxy/static/index.html +40 -0
  533. helm/proxy/static/index.js +456 -0
  534. helm/proxy/static/info-icon.png +0 -0
  535. helm/proxy/test_retry.py +1 -1
  536. helm/proxy/token_counters/auto_token_counter.py +1 -1
  537. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  538. helm/tokenizers/caching_tokenizer.py +2 -30
  539. helm/tokenizers/http_model_tokenizer.py +1 -1
  540. helm/tokenizers/huggingface_tokenizer.py +2 -2
  541. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  542. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  543. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  544. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  545. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  546. helm/tokenizers/tokenizer.py +3 -1
  547. helm/tokenizers/yalm_tokenizer.py +3 -3
  548. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  549. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  550. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  551. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  552. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  553. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  554. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  555. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  556. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  557. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  558. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  559. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  560. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  561. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  562. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  563. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  564. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  565. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  566. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  567. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  568. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  569. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  570. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  571. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  572. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  573. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  574. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  575. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  576. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  577. helm/tokenizers/anthropic_tokenizer.py +0 -52
  578. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
  579. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
  580. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/scenario.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, replace
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Any
 import os
 from pathlib import PurePath
 import inspect
@@ -67,6 +67,11 @@ class Input:
     multimedia_content: Optional[MultimediaObject] = None
     """A single input can consists of multimodal content interleaved (e.g., text, image, text, ...)."""

+    messages: Optional[List[Dict[str, str]]] = None
+    """Used for chat models.
+    If messages is specified for a chat model, the prompt is ignored.
+    Otherwise, the client should convert the prompt into a message."""
+

 @dataclass(frozen=True)
 class PassageQuestionInput(Input):
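
Illustrative sketch (not part of the diff): how the new Input.messages field added above might be used, assuming the field definition shown in this hunk. Per the docstring, a populated messages list takes precedence over the plain text prompt, which clients otherwise wrap into a single message. The "role"/"content" keys below are an assumption; the field is only typed as List[Dict[str, str]].

from helm.benchmark.scenarios.scenario import Input

# Chat-style input: the client reads `messages` and ignores `text`.
# The "role"/"content" keys are assumed here, not taken from the source.
chat_input = Input(
    text="",  # ignored when messages is provided
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize the discharge note below."},
    ],
)

# Plain-text input: the client converts the prompt into a single message.
plain_input = Input(text="Summarize the discharge note below.")
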
@@ -153,6 +158,9 @@ class Instance:
     contrast_references: Optional[List[List[Reference]]] = None
     """References for the perturbed input above (if available)"""

+    extra_data: Optional[Dict[str, Any]] = None
+    """Extra data required by the scenario e.g. chain-of-thought annotations"""
+
     @property
     def first_correct_reference(self) -> Optional[Reference]:
         """Return the first correct reference."""

helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py}
@@ -19,7 +19,7 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog

-# BHASA Scenarios
+# SEA-HELM Scenarios
 # A. Natural Language Understanding
 # B. Natural Language Generation
 # C. Natural Language Reasoning
@@ -95,7 +95,12 @@ class TyDiQAScenario(Scenario):
         self.splits = {"train": TRAIN_SPLIT, "validation": TEST_SPLIT}

     def get_instances(self, output_path) -> List[Instance]:
-        dataset = datasets.load_dataset("khalidalt/tydiqa-goldp", "indonesian")
+        dataset = datasets.load_dataset(
+            "khalidalt/tydiqa-goldp",
+            "indonesian",
+            revision="7d69b53c9c8187ae7e21d8441362efa1a7e3013d",
+            trust_remote_code=True,
+        )

         outputs = []
         for split in self.splits.keys():

helm/benchmark/scenarios/self_instruct_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List

 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output


 class SelfInstructScenario(Scenario):

helm/benchmark/scenarios/shc_bmt_scenario.py
@@ -0,0 +1,69 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCBMTMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient status gold-standard
+    for specific questions asked after a bone marrow transplant has taken place.
+    """
+
+    name = "shc_bmt_med"
+    description = (
+        "A dataset containing patient notes with associated "
+        "questions and answers related to bone marrow transplantation."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-BMT-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCBMTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_cdi_scenario.py
@@ -0,0 +1,70 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCDIMedScenario(Scenario):
+    """
+    This benchmark dataset was built from Clinical Document Integrity (CDI)
+    notes were there are verifications of clinical activities. The idea behind
+    it was to assess an LLM capability to answer these questions from previous notes.
+    """
+
+    name = "shc_cdi_med"
+    description = (
+        "A dataset built from Clinical Document Integrity (CDI) notes, to assess "
+        "the ability to answer verification questions from previous notes."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} , "
+                    "Answer the question with either 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CDI-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCDIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_conf_scenario.py
@@ -0,0 +1,70 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCCONFMedScenario(Scenario):
+    """
+    Benchmark derived from extracting confidential information from clinical notes.
+    From Evaluation of a Large Language Model to Identify Confidential Content in
+    Adolescent Encounter Notes published at https://jamanetwork.com/journals/jamapediatrics/fullarticle/2814109
+    """
+
+    name = "shc_conf_med"
+    description = (
+        "A dataset of clinical notes from adolescent patients used to identify sensitive "
+        "protected health information that should be restricted from parental access."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-CONF-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCCONFMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_ent_scenario.py
@@ -0,0 +1,72 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCENTMedScenario(Scenario):
+    """
+    This benchmark dataset was built to assess the capabilities "
+    "of an LLM for referral to the Ear, Nose and Throat department.
+    """
+
+    name = "shc_ent_med"
+    description = (
+        "A dataset designed to evaluate performance in "
+        "identifying appropriate patient referrals to Ear, Nose, and Throat specialists."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B", "C"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                if row["label"] != "":  # skip rows with character/encoding issues - 79
+                    question = row["prompt"]
+                    context = row["context"]
+                    answer = row["label"]
+                    prompt = (
+                        f"{counter} Provide an answer to the following {question} with the following context:"
+                        f" {context} , Answer the question with either 'A' for yes, 'B' for no, or 'C' for no mention."
+                        " Do not provide any additional details or response, just a simple A, B, or C response."
+                    )
+                    data[prompt] = answer
+                    counter = counter + 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-ENT-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCENTMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_gip_scenario.py
@@ -0,0 +1,66 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCGIPMedScenario(Scenario):
+    """
+    This benchmark dataset was built from a patient referral gold-standard set
+    to a specialty clinic to verify the ability of LLMs for patient hospice referral purposes.
+    """
+
+    name = "shc_gip_med"
+    description = "A dataset evaluating performance in identifying appropriate patient referrals to hospice care."
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-GIP-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCGIPMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_ptbm_scenario.py
@@ -0,0 +1,76 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCPTBMMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters of
+    children ages 4-6 years old with ADHD seen at Stanford's community-based primary
+    care network, Packard Children's Health Alliance, between 2015-2019. In this classification
+    task, the LLM is tasked with classifying whether the note contains clinician recommendation
+    for parent training in behavior management, which is the first-line evidence-based treatment
+    for young children with ADHD. From publication: https://doi.org/10.1093/jamia/ocae001
+    """
+
+    name = "shc_ptbm_med"
+    description = (
+        "A dataset that classifies whether a clinical note contains a clinician "
+        "recommendation for parent training in behavior management, which is the first-line "
+        "evidence-based treatment for young children with ADHD."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children with "
+                    "attention deficit hyperactivity disorder (ADHD) and classifying mentions of "
+                    f"behavioral therapy. Provide an answer to the following {question} with the "
+                    f"following context: {context} , Answer the question with a 'A' for yes or 'B' "
+                    "for no. Do not provide any additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-PTBM-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCPTBMMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_sei_scenario.py
@@ -0,0 +1,89 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSEIMedScenario(Scenario):
+    """
+    This dataset contains clinical notes from primary care visit encounters
+    (in-person/telehealth and telephone) of children ages 6-11 years old with ADHD
+    seen at Stanford's community-based primary care network, Packard Children's Health Alliance,
+    between 2015-2022. All children in this dataset were prescribed at least once an ADHD
+    medication (stimulants or non-stimulants) by a primary care clinician. In this
+    classification task, the LLM is tasked with classifying whether the note contains
+    documentation of side effect monitoring (recording of absence or presence of
+    medication side effects), as recommended in clinical practice guidelines.
+    From publication: https://doi.org/10.1542/peds.2024-067223
+    """
+
+    name = "shc_sei_med"
+    description = (
+        "A dataset that classifies whether a clinical note contains documentation "
+        "of side effect monitoring (recording of absence or presence of medication "
+        "side effects), as recommended in clinical practice guidelines."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                question = row["prompt"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    "You are reviewing a clinical note from health records of children "
+                    "with attention deficit hyperactivity disorder (ADHD). Given the following "
+                    "definitions: side Effects Inquiry (SEI): Explicit documentation by the clinician "
+                    "asking about current side effects related to ADHD medications that the child is "
+                    "taking or documentation of specific ADHD medication side effects experienced "
+                    "by the patient. SEI does *not* include future side effects monitoring, "
+                    "such as documentation of potential ADHD medication side effects, including "
+                    "planning to follow patients to monitor side effects, explaining about "
+                    "potential side effects of an ADHD medication. These documentations are not "
+                    "categorized as SEI because they consist of a plan or an explanation about "
+                    "side effects without actual side effect monitoring taking place, and "
+                    "No Side Effects Inquiry (NSEI): No evidence of side effects monitoring. "
+                    f"Provide an answer to the following {question} with the following context: {context} "
+                    ", Answer the question with a 'A' for yes or 'B' for no. Do not provide any additional "
+                    "details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-SEI-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSEIMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/shc_sequoia_scenario.py
@@ -0,0 +1,69 @@
+import sys
+import csv
+from typing import Dict, List
+
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+csv.field_size_limit(sys.maxsize)
+
+
+class SHCSequoiaMedScenario(Scenario):
+    """
+    Benchmark derived from manually curated answers to several questions for Sequoia clinic referrals
+    """
+
+    name = "shc_sequoia_med"
+    description = (
+        "A dataset containing manually curated answers to questions regarding patient referrals to the Sequoia clinic."
+    )
+    tags = ["knowledge", "reasoning", "biomedical"]
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = ["A", "B"]
+
+    def create_benchmark(self, csv_path) -> Dict[str, str]:
+        data = {}
+        counter = 1
+        with open(csv_path, "r") as file:
+            reader = csv.DictReader(file)  # , quoting=csv.QUOTE_MINIMAL
+            for row in reader:
+                question = row["question"]
+                context = row["context"]
+                answer = row["label"]
+                prompt = (
+                    f" {counter} Provide an answer to the following {question} with the following context:"
+                    f" {context} , Answer the question with a 'A' for yes or 'B' for no. Do not provide any "
+                    "additional details or response, just a simple A or B response."
+                )
+                data[prompt] = answer
+                counter += 1
+        return data
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = "/dbfs/mnt/azure_adbfs/Files/medhelm/medhelm-sequoia-dataset_filtered.csv"
+
+        instances: List[Instance] = []
+        benchmark_data = self.create_benchmark(data_path)
+
+        for prompt, answer in benchmark_data.items():
+            assert answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            references: List[Reference] = [
+                Reference(Output(text=pred_answer), tags=[CORRECT_TAG] if pred_answer == answer else [])
+                for pred_answer in SHCSequoiaMedScenario.POSSIBLE_ANSWER_CHOICES
+            ]
+            instances.append(
+                Instance(
+                    input=Input(text=prompt),
+                    references=references,  # [Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances

helm/benchmark/scenarios/simple_safety_tests_scenario.py
@@ -1,7 +1,7 @@
 from typing import List
 from datasets import load_dataset

-from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output


 class SimpleSafetyTestsScenario(Scenario):