crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
  2. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  5. helm/benchmark/annotation/call_center_annotator.py +22 -11
  6. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  7. helm/benchmark/annotation/live_qa_annotator.py +9 -4
  8. helm/benchmark/annotation/medication_qa_annotator.py +9 -4
  9. helm/benchmark/annotation/model_as_judge.py +70 -19
  10. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  11. helm/benchmark/annotation/xstest_annotator.py +20 -30
  12. helm/benchmark/metrics/safety_metrics.py +39 -17
  13. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  14. helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
  15. helm/benchmark/presentation/create_plots.py +1 -1
  16. helm/benchmark/presentation/schema.py +3 -0
  17. helm/benchmark/presentation/summarize.py +106 -256
  18. helm/benchmark/presentation/test_summarize.py +145 -3
  19. helm/benchmark/run_expander.py +27 -0
  20. helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
  21. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  22. helm/benchmark/run_specs/vlm_run_specs.py +8 -3
  23. helm/benchmark/scenarios/bhasa_scenario.py +226 -82
  24. helm/benchmark/scenarios/raft_scenario.py +1 -1
  25. helm/benchmark/static/schema_bhasa.yaml +10 -10
  26. helm/benchmark/static/schema_legal.yaml +566 -0
  27. helm/benchmark/static/schema_safety.yaml +25 -6
  28. helm/benchmark/static/schema_tables.yaml +26 -2
  29. helm/benchmark/static/schema_vhelm.yaml +42 -11
  30. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  31. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  32. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  33. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  34. helm/benchmark/static_build/index.html +1 -1
  35. helm/benchmark/window_services/tokenizer_service.py +0 -5
  36. helm/clients/openai_client.py +16 -1
  37. helm/clients/palmyra_client.py +1 -2
  38. helm/clients/together_client.py +22 -0
  39. helm/common/cache.py +8 -30
  40. helm/common/key_value_store.py +9 -9
  41. helm/common/mongo_key_value_store.py +3 -3
  42. helm/common/test_cache.py +1 -48
  43. helm/common/tokenization_request.py +0 -9
  44. helm/config/model_deployments.yaml +135 -3
  45. helm/config/model_metadata.yaml +134 -6
  46. helm/config/tokenizer_configs.yaml +24 -0
  47. helm/proxy/server.py +0 -9
  48. helm/proxy/services/remote_service.py +0 -6
  49. helm/proxy/services/server_service.py +5 -18
  50. helm/proxy/services/service.py +0 -6
  51. helm/benchmark/data_overlap/__init__.py +0 -0
  52. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  53. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  54. helm/benchmark/data_overlap/light_scenario.py +0 -60
  55. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  56. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  57. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  58. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  59. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  60. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
@@ -578,14 +578,18 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_syntax_minimal_pairs_{language}"],
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_syntax_minimal_pairs_{language}",
+            f"lindsea_syntax_minimal_pairs_{method}_{language}",
+        ],
     )


-# 2.1. Pragmatics: LINDSEA Pragmatic Reasoning (single sentence)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_single")
-def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"
+# 2.1. Pragmatics: LINDSEA Presuppositions
+@run_spec_function("lindsea_pragmatics_presuppositions")
+def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_presuppositions_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -595,9 +599,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningSingleScenario",
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPresuppositionsScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -606,14 +611,18 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"],
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_pragmatics_presuppositions_{language}",
+            f"lindsea_pragmatics_presuppositions_{subset}_{language}",
+        ],
     )


-# 2.2. Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_pair")
-def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"
+# 2.2. Pragmatics: LINDSEA Scalar Implicatures
+@run_spec_function("lindsea_pragmatics_scalar_implicatures")
+def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -623,9 +632,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningPairScenario",
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -634,5 +644,9 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"],
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_pragmatics_scalar_implicatures_{language}",
+            f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}",
+        ],
     )
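Note (not part of the diff): a minimal sketch of how the new subset argument surfaces in the run spec name and groups, based on the f-strings above. Calling the registered run spec function directly is illustrative only.

from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_pragmatics_presuppositions_spec

# The decorator only registers the function; calling it directly just builds the RunSpec object.
run_spec = get_lindsea_pragmatics_presuppositions_spec(language="id", subset="single")
print(run_spec.name)    # lindsea_pragmatics_presuppositions_single_id
print(run_spec.groups)  # ["bhasa_linguistic",
                        #  "lindsea_pragmatics_presuppositions_id",
                        #  "lindsea_pragmatics_presuppositions_single_id"]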
@@ -89,10 +89,14 @@ def get_banking77_spec() -> RunSpec:

     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.banking77_scenario.Banking77Scenario", args={})

-    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77
+    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77,
+    # with a slight modification to the instruction prompt for instruction-following models.
     scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), Banking77Scenario.name)
+    instructions = get_raft_instructions("banking_77", scenario_cache_path).replace(
+        "\n", " Answer with only the label for the last query.\n", 1
+    )
     adapter_spec = get_generation_adapter_spec(
-        instructions=get_raft_instructions("banking_77", scenario_cache_path),
+        instructions=instructions,
         input_noun=None,
         output_noun="Label",
         max_tokens=30,  # at most ~50 characters per label
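Note (not part of the diff): str.replace with count=1 only rewrites the first newline, so the extra sentence lands at the end of the first line of the RAFT instructions. The instruction text below is invented for illustration.

# Toy RAFT-style instruction string (invented); the transformation mirrors the diff above.
raft_instructions = "Classify the banking query into one of the 77 intents.\nPossible labels are listed below.\n"
modified = raft_instructions.replace("\n", " Answer with only the label for the last query.\n", 1)
print(modified)
# Classify the banking query into one of the 77 intents. Answer with only the label for the last query.
# Possible labels are listed below.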
@@ -690,13 +690,18 @@ def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
         + _get_open_ended_generation_metric_specs()
     )

-    run_spec_name: str = "bingo"
+    group_name: str = "bingo"
+    if subject == "Region":
+        group_name += "_fairness"
+    elif subject == "OCR":
+        group_name += "_multilinguality"
+
     return RunSpec(
-        name=f"{run_spec_name}:subject={subject}",
+        name=f"bingo:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=[group_name],
     )


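Note (not part of the diff): the subject-to-group mapping above as a standalone sketch; subjects other than "Region" and "OCR" (for example, a hypothetical "Object") keep the plain "bingo" group.

def bingo_group_for_subject(subject: str) -> str:  # hypothetical helper, for illustration only
    group_name = "bingo"
    if subject == "Region":
        group_name += "_fairness"
    elif subject == "OCR":
        group_name += "_multilinguality"
    return group_name

assert bingo_group_for_subject("Region") == "bingo_fairness"
assert bingo_group_for_subject("OCR") == "bingo_multilinguality"
assert bingo_group_for_subject("Object") == "bingo"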
@@ -171,7 +171,7 @@ class XQuADScenario(Scenario):
         super().__init__()
         self.language = language
         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
-        self.map = {
+        self.language_to_prompt_components = {
             "th": {
                 "passage_prefix": "ข้อความ: ",
                 "question_prefix": "คำถาม: ",
@@ -183,13 +183,19 @@ class XQuADScenario(Scenario):
                 "random_state": 4502,
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
         df = dataset.to_pandas()

         # Sample 1000 examples for test
-        df_test = df.sample(n=1000, random_state=self.map[self.language]["random_state"])
+        df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])

         # In-context examples to be drawn from remaining examples (since there is no train data)
         df_train = df[~df.index.isin(df_test.index)]
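Note (not part of the diff): the sampling pattern above on a toy DataFrame. A fixed random_state keeps the 1000-example test split reproducible, and in-context examples come only from rows that were not sampled into it.

import pandas as pd

df = pd.DataFrame({"question": [f"q{i}" for i in range(10)]})
df_test = df.sample(n=4, random_state=4502)    # 4502 is the seed the diff shows for "th"
df_train = df[~df.index.isin(df_test.index)]   # disjoint pool for in-context examples
assert set(df_test.index).isdisjoint(df_train.index)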
@@ -210,8 +216,8 @@ class XQuADScenario(Scenario):
             input = PassageQuestionInput(
                 passage=passage,
                 question=question,
-                passage_prefix=str(self.map[self.language]["passage_prefix"]),
-                question_prefix=str(self.map[self.language]["question_prefix"]),
+                passage_prefix=str(self.prompt_components["passage_prefix"]),
+                question_prefix=str(self.prompt_components["question_prefix"]),
             )
             references = []
             for answer in row["answers"]["text"]:
@@ -1068,6 +1074,9 @@ class FloresScenario(Scenario):
             "ta": "tam_Taml",
         }

+        if self.source not in self.languages.keys() or self.target not in self.languages.keys():
+            raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
+
     def get_instances(self, output_path) -> List[Instance]:
         source_dataset = datasets.load_dataset(
             "facebook/flores",
@@ -1259,6 +1268,9 @@ class XNLIScenario(Scenario):
             "test": TEST_SPLIT,
         }
         self.id2label = {0: "A", 2: "B", 1: "C"}
+        self.supported_languages = ["th", "vi"]
+        if self.language not in self.supported_languages:
+            raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")

     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xnli", self.language)
@@ -1449,7 +1461,7 @@ class XCOPAScenario(Scenario):
             0: "A",
             1: "B",
         }
-        self.prompt = {
+        self.language_to_prompt_components = {
             "id": {
                 "cause": "sebab",
                 "effect": "akibat",
@@ -1476,6 +1488,12 @@ class XCOPAScenario(Scenario):
                 "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def get_instances(self, output_path) -> List[Instance]:
         language_dataset = datasets.load_dataset("xcopa", self.language)
@@ -1489,15 +1507,13 @@ class XCOPAScenario(Scenario):
             language_df, tamil_df[["question", "idx"]], on="idx"
         )  # Use the Tamil split's question column
         for _, row in data.iterrows():
-            instruction1 = self.prompt[self.language]["instruction1"].format(
-                self.prompt[self.language][row["question_y"]]
-            )
+            instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
             passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
                 premise=row["premise"].strip(),
                 instruction1=instruction1,
                 choice1=row["choice1"].strip(),
                 choice2=row["choice2"].strip(),
-                instruction2=self.prompt[self.language]["instruction2"],
+                instruction2=self.prompt_components["instruction2"],
             )
             input = Input(passage)
             output = Output(self.id2label[int(row["label"])])
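Note (not part of the diff): a toy illustration of how the XCOPA passage is assembled. The "cause"/"effect" translations come from the diff; the premise, choices, and instruction wordings below are invented, only the template and lookup pattern match the code above.

prompt_components = {
    "cause": "sebab",
    "effect": "akibat",
    "instruction1": "Apa {} dari hal ini?",                   # invented wording
    "instruction2": "Jawablah dengan satu huruf A atau B.",   # invented wording
}
row = {"premise": "Lantai basah.", "question_y": "cause", "choice1": "Hujan turun.", "choice2": "Lampu mati."}

instruction1 = prompt_components["instruction1"].format(prompt_components[row["question_y"]])
passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
    premise=row["premise"].strip(),
    instruction1=instruction1,
    choice1=row["choice1"].strip(),
    choice2=row["choice2"].strip(),
    instruction2=prompt_components["instruction2"],
)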
@@ -1549,18 +1565,24 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):

     name = "lindsea_minimal_pairs"
     description = "LINDSEA minimal pairs task"
-    tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]
+    tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]

     def __init__(self, method: str, language: str):
         super().__init__()
         self.method = method
         self.language = language
-        self.prompts = {
+        self.language_to_prompt_components = {
             "id": {
                 "instructions": "Kalimat mana yang lebih mungkin?",
                 "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
             }
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1586,6 +1608,7 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
         outputs = []
         if self.method == "mcq":
             category_list = data["category"].value_counts().keys()
+
             hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
             for category in category_list:
                 # Fix shuffling within each category
@@ -1594,10 +1617,8 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
                 options = [(row["correct"], 1), (row["wrong"], 2)]
                 random.shuffle(options)
                 options_reversed = True if options[0][1] == 2 else False
-
-                prompt_components = self.prompts[self.language]
-                instructions = prompt_components["instructions"]
-                output_prefix = prompt_components["output_prefix"]
+                instructions = self.prompt_components["instructions"]
+                output_prefix = self.prompt_components["output_prefix"]
                 prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
                 input = Input(text=prompt)
                 # Determine correct option based on whether shuffling reversed the options
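Note (not part of the diff): the option-shuffling logic above in isolation. Each option carries a tag (1 = correct, 2 = wrong); after shuffling, the position of the correct tag decides whether "A" or "B" is the right answer. The sentences are placeholders.

import random

random.seed(42)  # the real code fixes the shuffle per category; this seed value is illustrative
row = {"correct": "She has eaten.", "wrong": "She have eaten."}
options = [(row["correct"], 1), (row["wrong"], 2)]
random.shuffle(options)
options_reversed = options[0][1] == 2
correct_letter = "B" if options_reversed else "A"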
@@ -1625,23 +1646,31 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
         return outputs


-# 2. Pragmatics
-# 2.1 LINDSEA Pragmatic Reasoning (single sentence)
-class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
+# 2.1 Pragmatics: LINDSEA Presuppositions
+class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
     """
-    The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control. The high-level categories tested for include scalar implicatures and presuppositions.
+    of quality control.

-    The single-sentence pragmatic reasoning dataset involves questions targeting the truth value of a single sentence.
-    The system under test needs to determine if the sentence is true/false or if the proposition is possible/impossible.
+    The presuppositions dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
+    from another sentence.

-    The models are prompted using the following general format:
+    For the single format, the models are prompted using the following general format:

         Is the following statement true or false?
         Statement: <sentence>
         Answer only with True or False.

+    For the pair format, the models are prompted using the following general format:
+
+        Situation: <premise>
+        Given this situation, is the following statement true or false?
+        Statement: <hypothesis>
+        Answer only with True or False.
+
     Target completion:
         <answer>

@@ -1661,50 +1690,101 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         }
     """

-    name = "lindsea_pragmatic_reasoning_single"
-    description = "LINDSEA pragmatic reasoning single sentence task"
-    tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+    name = "lindsea_pragmatics_presuppositions"
+    description = "LINDSEA presuppositions task"
+    tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.prompt = {
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "question": "Apakah pernyataan berikut ini {}?",
-                "instruction": "Jawablah dengan {} saja.",
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-        URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_single.jsonl"
-        file = "pragmatic_reasoning_single"
-        target_path_file = os.path.join(output_path, file)
-        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
-        dataset = pd.read_json(target_path_file, lines=True)
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "presuppositions"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage = "{question}\nPernyataan: {text}\n{instruction}".format(
-                question=self.prompt[self.language]["question"].format(row["question_translated"]),
-                text=row["text"],
-                instruction=self.prompt[self.language]["instruction"].format(row["choices_translated"]),
-            )
-            input = Input(text=passage)
-
-            # Split "True or False" into ["True", "or", "False"]
-            choices = row["choices"].split()
-            choices_translated = row["choices_translated"].split()
-            label2choice = {
-                choices[0]: choices_translated[0],
-                choices[2]: choices_translated[2],
-            }
-            references = [
-                Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
-            ]
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
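Note (not part of the diff): hypothetical instances showing the two prompt layouts described in the class docstring. The Indonesian prompt components come from the diff; the example sentences are invented.

single_passage = (
    "Apakah pernyataan berikut ini benar atau salah?\n"
    "Pernyataan: Semua kucing adalah hewan.\n"
    "Jawablah dengan Benar atau Salah saja."
)
pair_passage = (
    "Situasi: Ali berhenti merokok.\n"
    "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?\n"
    "Pernyataan: Ali pernah merokok.\n"
    "Jawablah dengan Benar atau Salah saja."
)
# Target completions are "Benar" or "Salah", mapped from the True/False labels.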
@@ -1714,17 +1794,25 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         return outputs


-# 2.2 Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
+# 2.2 Pragmatics: LINDSEA Scalar Implicatures
+class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
     """
-    The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The LINDSEA Scalar Implicatures Scenario dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control. The high-level categories tested for include scalar implicatures and presuppositions.
+    of quality control.

-    The sentence-pair pragmatic reasoning dataset involves questions targeting whether a conclusion can be drawn
+    The scalar implicatures dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
     from another sentence.

-    The models are prompted using the following general format:
+    For the single format, the models are prompted using the following general format:
+
+        Is the following statement true or false?
+        Statement: <sentence>
+        Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:

         Situation: <premise>
         Given this situation, is the following statement true or false?
@@ -1750,45 +1838,101 @@ class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
         }
     """

-    name = "lindsea_pragmatic_reasoning_pair"
-    description = "LINDSEA pragmatic reasoning sentence pair task"
-    tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+    name = "lindsea_pragmatics_scalar_implicatures"
+    description = "LINDSEA scalar implicatures task"
+    tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.prompt = {
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
-                "instruction": "Jawablah dengan Benar atau Salah saja.",
-                True: "Benar",
-                False: "Salah",
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-        URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_pair.jsonl"
-        file = "pragmatic_reasoning_pair"
-        target_path_file = os.path.join(output_path, file)
-        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
-        dataset = pd.read_json(target_path_file, lines=True)
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage = "Situasi: {premise}\n{question}\nPernyataan: {conclusion}\n{instruction}".format(
-                premise=row["text"],
-                question=self.prompt[self.language]["question"],
-                conclusion=row["conclusion"],
-                instruction=self.prompt[self.language]["instruction"],
-            )
-            input = Input(text=passage)
-            references = [
-                Reference(Output(text=self.prompt[self.language][row["label"]]), tags=[CORRECT_TAG]),
-            ]
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
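Note (not part of the diff): the download-and-filter pattern shared by the two pragmatics scenarios, as a standalone pandas sketch. Each subset file is read as JSON Lines, tagged with its subset, filtered to a single linguistic phenomenon, and the frames are concatenated; the local file paths are assumed.

import pandas as pd

frames = []
for subset in ["single", "pair"]:
    data = pd.read_json(f"pragmatic_reasoning_{subset}.jsonl", lines=True)  # assumes the files were downloaded
    data["subset"] = subset
    data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
    frames.append(data)
dataset = pd.concat(frames)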
@@ -40,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir: str):
     return field_ordering[subset], instructions[subset]


-def get_raft_instructions(subset: str, cache_dir: str):
+def get_raft_instructions(subset: str, cache_dir: str) -> str:
     return get_raft_prompt_settings(subset, cache_dir)[1]
