crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
  2. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
  3. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
  4. helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
  5. helm/benchmark/annotation/call_center_annotator.py +22 -11
  6. helm/benchmark/annotation/harm_bench_annotator.py +11 -24
  7. helm/benchmark/annotation/live_qa_annotator.py +9 -4
  8. helm/benchmark/annotation/medication_qa_annotator.py +9 -4
  9. helm/benchmark/annotation/model_as_judge.py +70 -19
  10. helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
  11. helm/benchmark/annotation/xstest_annotator.py +20 -30
  12. helm/benchmark/metrics/safety_metrics.py +39 -17
  13. helm/benchmark/metrics/unitxt_metrics.py +17 -3
  14. helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
  15. helm/benchmark/presentation/create_plots.py +1 -1
  16. helm/benchmark/presentation/schema.py +3 -0
  17. helm/benchmark/presentation/summarize.py +106 -256
  18. helm/benchmark/presentation/test_summarize.py +145 -3
  19. helm/benchmark/run_expander.py +27 -0
  20. helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
  21. helm/benchmark/run_specs/finance_run_specs.py +6 -2
  22. helm/benchmark/run_specs/vlm_run_specs.py +8 -3
  23. helm/benchmark/scenarios/bhasa_scenario.py +226 -82
  24. helm/benchmark/scenarios/raft_scenario.py +1 -1
  25. helm/benchmark/static/schema_bhasa.yaml +10 -10
  26. helm/benchmark/static/schema_legal.yaml +566 -0
  27. helm/benchmark/static/schema_safety.yaml +25 -6
  28. helm/benchmark/static/schema_tables.yaml +26 -2
  29. helm/benchmark/static/schema_vhelm.yaml +42 -11
  30. helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
  31. helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
  32. helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
  33. helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
  34. helm/benchmark/static_build/index.html +1 -1
  35. helm/benchmark/window_services/tokenizer_service.py +0 -5
  36. helm/clients/openai_client.py +16 -1
  37. helm/clients/palmyra_client.py +1 -2
  38. helm/clients/together_client.py +22 -0
  39. helm/common/cache.py +8 -30
  40. helm/common/key_value_store.py +9 -9
  41. helm/common/mongo_key_value_store.py +3 -3
  42. helm/common/test_cache.py +1 -48
  43. helm/common/tokenization_request.py +0 -9
  44. helm/config/model_deployments.yaml +135 -3
  45. helm/config/model_metadata.yaml +134 -6
  46. helm/config/tokenizer_configs.yaml +24 -0
  47. helm/proxy/server.py +0 -9
  48. helm/proxy/services/remote_service.py +0 -6
  49. helm/proxy/services/server_service.py +5 -18
  50. helm/proxy/services/service.py +0 -6
  51. helm/benchmark/data_overlap/__init__.py +0 -0
  52. helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
  53. helm/benchmark/data_overlap/export_scenario_text.py +0 -119
  54. helm/benchmark/data_overlap/light_scenario.py +0 -60
  55. helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
  56. helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
  57. helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
  58. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
  59. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
  60. {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
@@ -578,14 +578,18 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_syntax_minimal_pairs_{language}"],
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_syntax_minimal_pairs_{language}",
+            f"lindsea_syntax_minimal_pairs_{method}_{language}",
+        ],
     )


-# 2.1. Pragmatics: LINDSEA Pragmatic Reasoning (single sentence)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_single")
-def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"
+# 2.1. Pragmatics: LINDSEA Presuppositions
+@run_spec_function("lindsea_pragmatics_presuppositions")
+def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_presuppositions_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -595,9 +599,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningSingleScenario",
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPresuppositionsScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -606,14 +611,18 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_single_{language}"],
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_pragmatics_presuppositions_{language}",
+            f"lindsea_pragmatics_presuppositions_{subset}_{language}",
+        ],
     )


-# 2.2. Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-@run_spec_function("lindsea_pragmatics_pragmatic_reasoning_pair")
-def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSpec:
-    name = f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"
+# 2.2. Pragmatics: LINDSEA Scalar Implicatures
+@run_spec_function("lindsea_pragmatics_scalar_implicatures")
+def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],
@@ -623,9 +632,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPragmaticReasoningPairScenario",
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -634,5 +644,9 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["bhasa_linguistic", f"lindsea_pragmatics_pragmatic_reasoning_pair_{language}"],
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_pragmatics_scalar_implicatures_{language}",
+            f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}",
+        ],
     )
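Note (not part of the diff): a minimal sketch of how the new subset argument surfaces in the run spec name and groups, based on the f-strings above. Calling the registered run spec function directly is illustrative only.

from helm.benchmark.run_specs.bhasa_run_specs import get_lindsea_pragmatics_presuppositions_spec

# The decorator only registers the function; calling it directly just builds the RunSpec object.
run_spec = get_lindsea_pragmatics_presuppositions_spec(language="id", subset="single")
print(run_spec.name)    # lindsea_pragmatics_presuppositions_single_id
print(run_spec.groups)  # ["bhasa_linguistic",
                        #  "lindsea_pragmatics_presuppositions_id",
                        #  "lindsea_pragmatics_presuppositions_single_id"]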
@@ -89,10 +89,14 @@ def get_banking77_spec() -> RunSpec:

     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.banking77_scenario.Banking77Scenario", args={})

-    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77
+    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77,
+    # with a slight modification to the instruction prompt for instruction-following models.
     scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), Banking77Scenario.name)
+    instructions = get_raft_instructions("banking_77", scenario_cache_path).replace(
+        "\n", " Answer with only the label for the last query.\n", 1
+    )
     adapter_spec = get_generation_adapter_spec(
-        instructions=get_raft_instructions("banking_77", scenario_cache_path),
+        instructions=instructions,
         input_noun=None,
         output_noun="Label",
         max_tokens=30,  # at most ~50 characters per label
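Note (not part of the diff): str.replace with count=1 only rewrites the first newline, so the extra sentence lands at the end of the first line of the RAFT instructions. The instruction text below is invented for illustration.

# Toy RAFT-style instruction string (invented); the transformation mirrors the diff above.
raft_instructions = "Classify the banking query into one of the 77 intents.\nPossible labels are listed below.\n"
modified = raft_instructions.replace("\n", " Answer with only the label for the last query.\n", 1)
print(modified)
# Classify the banking query into one of the 77 intents. Answer with only the label for the last query.
# Possible labels are listed below.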
@@ -690,13 +690,18 @@ def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
         + _get_open_ended_generation_metric_specs()
     )

-    run_spec_name: str = "bingo"
+    group_name: str = "bingo"
+    if subject == "Region":
+        group_name += "_fairness"
+    elif subject == "OCR":
+        group_name += "_multilinguality"
+
     return RunSpec(
-        name=f"{run_spec_name}:subject={subject}",
+        name=f"bingo:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[run_spec_name],
+        groups=[group_name],
     )


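Note (not part of the diff): the subject-to-group mapping above as a standalone sketch; subjects other than "Region" and "OCR" (for example, a hypothetical "Object") keep the plain "bingo" group.

def bingo_group_for_subject(subject: str) -> str:  # hypothetical helper, for illustration only
    group_name = "bingo"
    if subject == "Region":
        group_name += "_fairness"
    elif subject == "OCR":
        group_name += "_multilinguality"
    return group_name

assert bingo_group_for_subject("Region") == "bingo_fairness"
assert bingo_group_for_subject("OCR") == "bingo_multilinguality"
assert bingo_group_for_subject("Object") == "bingo"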
@@ -171,7 +171,7 @@ class XQuADScenario(Scenario):
         super().__init__()
         self.language = language
         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
-        self.map = {
+        self.language_to_prompt_components = {
             "th": {
                 "passage_prefix": "ข้อความ: ",
                 "question_prefix": "คำถาม: ",
@@ -183,13 +183,19 @@ class XQuADScenario(Scenario):
                 "random_state": 4502,
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
         df = dataset.to_pandas()

         # Sample 1000 examples for test
-        df_test = df.sample(n=1000, random_state=self.map[self.language]["random_state"])
+        df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])

         # In-context examples to be drawn from remaining examples (since there is no train data)
         df_train = df[~df.index.isin(df_test.index)]
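Note (not part of the diff): the sampling pattern above on a toy DataFrame. A fixed random_state keeps the 1000-example test split reproducible, and in-context examples come only from rows that were not sampled into it.

import pandas as pd

df = pd.DataFrame({"question": [f"q{i}" for i in range(10)]})
df_test = df.sample(n=4, random_state=4502)    # 4502 is the seed the diff shows for "th"
df_train = df[~df.index.isin(df_test.index)]   # disjoint pool for in-context examples
assert set(df_test.index).isdisjoint(df_train.index)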
@@ -210,8 +216,8 @@ class XQuADScenario(Scenario):
             input = PassageQuestionInput(
                 passage=passage,
                 question=question,
-                passage_prefix=str(self.map[self.language]["passage_prefix"]),
-                question_prefix=str(self.map[self.language]["question_prefix"]),
+                passage_prefix=str(self.prompt_components["passage_prefix"]),
+                question_prefix=str(self.prompt_components["question_prefix"]),
             )
             references = []
             for answer in row["answers"]["text"]:
@@ -1068,6 +1074,9 @@ class FloresScenario(Scenario):
             "ta": "tam_Taml",
         }

+        if self.source not in self.languages.keys() or self.target not in self.languages.keys():
+            raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
+
     def get_instances(self, output_path) -> List[Instance]:
         source_dataset = datasets.load_dataset(
             "facebook/flores",
@@ -1259,6 +1268,9 @@ class XNLIScenario(Scenario):
             "test": TEST_SPLIT,
         }
         self.id2label = {0: "A", 2: "B", 1: "C"}
+        self.supported_languages = ["th", "vi"]
+        if self.language not in self.supported_languages:
+            raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")

     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xnli", self.language)
@@ -1449,7 +1461,7 @@ class XCOPAScenario(Scenario):
             0: "A",
             1: "B",
         }
-        self.prompt = {
+        self.language_to_prompt_components = {
             "id": {
                 "cause": "sebab",
                 "effect": "akibat",
@@ -1476,6 +1488,12 @@ class XCOPAScenario(Scenario):
                 "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def get_instances(self, output_path) -> List[Instance]:
         language_dataset = datasets.load_dataset("xcopa", self.language)
@@ -1489,15 +1507,13 @@ class XCOPAScenario(Scenario):
             language_df, tamil_df[["question", "idx"]], on="idx"
         )  # Use the Tamil split's question column
         for _, row in data.iterrows():
-            instruction1 = self.prompt[self.language]["instruction1"].format(
-                self.prompt[self.language][row["question_y"]]
-            )
+            instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
             passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
                 premise=row["premise"].strip(),
                 instruction1=instruction1,
                 choice1=row["choice1"].strip(),
                 choice2=row["choice2"].strip(),
-                instruction2=self.prompt[self.language]["instruction2"],
+                instruction2=self.prompt_components["instruction2"],
             )
             input = Input(passage)
             output = Output(self.id2label[int(row["label"])])
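Note (not part of the diff): a toy illustration of how the XCOPA passage is assembled. The "cause"/"effect" translations come from the diff; the premise, choices, and instruction wordings below are invented, only the template and lookup pattern match the code above.

prompt_components = {
    "cause": "sebab",
    "effect": "akibat",
    "instruction1": "Apa {} dari hal ini?",                   # invented wording
    "instruction2": "Jawablah dengan satu huruf A atau B.",   # invented wording
}
row = {"premise": "Lantai basah.", "question_y": "cause", "choice1": "Hujan turun.", "choice2": "Lampu mati."}

instruction1 = prompt_components["instruction1"].format(prompt_components[row["question_y"]])
passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
    premise=row["premise"].strip(),
    instruction1=instruction1,
    choice1=row["choice1"].strip(),
    choice2=row["choice2"].strip(),
    instruction2=prompt_components["instruction2"],
)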
@@ -1549,18 +1565,24 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):

     name = "lindsea_minimal_pairs"
     description = "LINDSEA minimal pairs task"
-    tags = ["minimal_pairs", "linguistic_diagnostic", "syntax"]
+    tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]

     def __init__(self, method: str, language: str):
         super().__init__()
         self.method = method
         self.language = language
-        self.prompts = {
+        self.language_to_prompt_components = {
             "id": {
                 "instructions": "Kalimat mana yang lebih mungkin?",
                 "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
             }
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
@@ -1586,6 +1608,7 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
         outputs = []
         if self.method == "mcq":
             category_list = data["category"].value_counts().keys()
+
             hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
             for category in category_list:
                 # Fix shuffling within each category
@@ -1594,10 +1617,8 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
                 options = [(row["correct"], 1), (row["wrong"], 2)]
                 random.shuffle(options)
                 options_reversed = True if options[0][1] == 2 else False
-
-                prompt_components = self.prompts[self.language]
-                instructions = prompt_components["instructions"]
-                output_prefix = prompt_components["output_prefix"]
+                instructions = self.prompt_components["instructions"]
+                output_prefix = self.prompt_components["output_prefix"]
                 prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
                 input = Input(text=prompt)
                 # Determine correct option based on whether shuffling reversed the options
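Note (not part of the diff): the option-shuffling logic above in isolation. Each option carries a tag (1 = correct, 2 = wrong); after shuffling, the position of the correct tag decides whether "A" or "B" is the right answer. The sentences are placeholders.

import random

random.seed(42)  # the real code fixes the shuffle per category; this seed value is illustrative
row = {"correct": "She has eaten.", "wrong": "She have eaten."}
options = [(row["correct"], 1), (row["wrong"], 2)]
random.shuffle(options)
options_reversed = options[0][1] == 2
correct_letter = "B" if options_reversed else "A"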
@@ -1625,23 +1646,31 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
         return outputs


-# 2. Pragmatics
-# 2.1 LINDSEA Pragmatic Reasoning (single sentence)
-class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
+# 2.1 Pragmatics: LINDSEA Presuppositions
+class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
     """
-    The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control. The high-level categories tested for include scalar implicatures and presuppositions.
+    of quality control.

-    The single-sentence pragmatic reasoning dataset involves questions targeting the truth value of a single sentence.
-    The system under test needs to determine if the sentence is true/false or if the proposition is possible/impossible.
+    The presuppositions dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
+    from another sentence.

-    The models are prompted using the following general format:
+    For the single format, the models are prompted using the following general format:

         Is the following statement true or false?
         Statement: <sentence>
         Answer only with True or False.

+    For the pair format, the models are prompted using the following general format:
+
+        Situation: <premise>
+        Given this situation, is the following statement true or false?
+        Statement: <hypothesis>
+        Answer only with True or False.
+
     Target completion:
         <answer>

@@ -1661,50 +1690,101 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         }
     """

-    name = "lindsea_pragmatic_reasoning_single"
-    description = "LINDSEA pragmatic reasoning single sentence task"
-    tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+    name = "lindsea_pragmatics_presuppositions"
+    description = "LINDSEA presuppositions task"
+    tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.prompt = {
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "question": "Apakah pernyataan berikut ini {}?",
-                "instruction": "Jawablah dengan {} saja.",
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-        URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_single.jsonl"
-        file = "pragmatic_reasoning_single"
-        target_path_file = os.path.join(output_path, file)
-        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
-        dataset = pd.read_json(target_path_file, lines=True)
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "presuppositions"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage = "{question}\nPernyataan: {text}\n{instruction}".format(
-                question=self.prompt[self.language]["question"].format(row["question_translated"]),
-                text=row["text"],
-                instruction=self.prompt[self.language]["instruction"].format(row["choices_translated"]),
-            )
-            input = Input(text=passage)
-
-            # Split "True or False" into ["True", "or", "False"]
-            choices = row["choices"].split()
-            choices_translated = row["choices_translated"].split()
-            label2choice = {
-                choices[0]: choices_translated[0],
-                choices[2]: choices_translated[2],
-            }
-            references = [
-                Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
-            ]
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
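Note (not part of the diff): hypothetical instances showing the two prompt layouts described in the class docstring. The Indonesian prompt components come from the diff; the example sentences are invented.

single_passage = (
    "Apakah pernyataan berikut ini benar atau salah?\n"
    "Pernyataan: Semua kucing adalah hewan.\n"
    "Jawablah dengan Benar atau Salah saja."
)
pair_passage = (
    "Situasi: Ali berhenti merokok.\n"
    "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?\n"
    "Pernyataan: Ali pernah merokok.\n"
    "Jawablah dengan Benar atau Salah saja."
)
# Target completions are "Benar" or "Salah", mapped from the True/False labels.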
@@ -1714,17 +1794,25 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         return outputs


-# 2.2 Pragmatics: LINDSEA Pragmatic Reasoning (sentence pair)
-class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
+# 2.2 Pragmatics: LINDSEA Scalar Implicatures
+class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
     """
-    The LINDSEA Pragmatic Reasoning dataset is a linguistic diagnostic scenario targeting pragmatics.
+    The LINDSEA Scalar Implicatures Scenario dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control. The high-level categories tested for include scalar implicatures and presuppositions.
+    of quality control.

-    The sentence-pair pragmatic reasoning dataset involves questions targeting whether a conclusion can be drawn
+    The scalar implicatures dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
     from another sentence.

-    The models are prompted using the following general format:
+    For the single format, the models are prompted using the following general format:
+
+        Is the following statement true or false?
+        Statement: <sentence>
+        Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:

         Situation: <premise>
         Given this situation, is the following statement true or false?
@@ -1750,45 +1838,101 @@ class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
         }
     """

-    name = "lindsea_pragmatic_reasoning_pair"
-    description = "LINDSEA pragmatic reasoning sentence pair task"
-    tags = ["pragmatic_reasoning", "linguistic_diagnostic", "pragmatics"]
+    name = "lindsea_pragmatics_scalar_implicatures"
+    description = "LINDSEA scalar implicatures task"
+    tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.prompt = {
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
-                "instruction": "Jawablah dengan Benar atau Salah saja.",
-                True: "Benar",
-                False: "Salah",
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-        URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_pair.jsonl"
-        file = "pragmatic_reasoning_pair"
-        target_path_file = os.path.join(output_path, file)
-        ensure_file_downloaded(source_url=URL, target_path=target_path_file)
-        dataset = pd.read_json(target_path_file, lines=True)
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage = "Situasi: {premise}\n{question}\nPernyataan: {conclusion}\n{instruction}".format(
-                premise=row["text"],
-                question=self.prompt[self.language]["question"],
-                conclusion=row["conclusion"],
-                instruction=self.prompt[self.language]["instruction"],
-            )
-            input = Input(text=passage)
-            references = [
-                Reference(Output(text=self.prompt[self.language][row["label"]]), tags=[CORRECT_TAG]),
-            ]
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
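Note (not part of the diff): the download-and-filter pattern shared by the two pragmatics scenarios, as a standalone pandas sketch. Each subset file is read as JSON Lines, tagged with its subset, filtered to a single linguistic phenomenon, and the frames are concatenated; the local file paths are assumed.

import pandas as pd

frames = []
for subset in ["single", "pair"]:
    data = pd.read_json(f"pragmatic_reasoning_{subset}.jsonl", lines=True)  # assumes the files were downloaded
    data["subset"] = subset
    data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
    frames.append(data)
dataset = pd.concat(frames)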
@@ -40,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir: str):
     return field_ordering[subset], instructions[subset]


-def get_raft_instructions(subset: str, cache_dir: str):
+def get_raft_instructions(subset: str, cache_dir: str) -> str:
     return get_raft_prompt_settings(subset, cache_dir)[1]
