crfm-helm 0.5.3__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/METADATA +57 -62
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/RECORD +53 -55
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/WHEEL +1 -1
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/live_qa_annotator.py +9 -4
- helm/benchmark/annotation/medication_qa_annotator.py +9 -4
- helm/benchmark/annotation/model_as_judge.py +70 -19
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/unitxt_metrics.py +17 -3
- helm/benchmark/metrics/vision_language/image_metrics.py +6 -2
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +106 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/run_expander.py +27 -0
- helm/benchmark/run_specs/bhasa_run_specs.py +27 -13
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/vlm_run_specs.py +8 -3
- helm/benchmark/scenarios/bhasa_scenario.py +226 -82
- helm/benchmark/scenarios/raft_scenario.py +1 -1
- helm/benchmark/static/schema_bhasa.yaml +10 -10
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_safety.yaml +25 -6
- helm/benchmark/static/schema_tables.yaml +26 -2
- helm/benchmark/static/schema_vhelm.yaml +42 -11
- helm/benchmark/static_build/assets/index-3ee38b3d.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/index.html +1 -1
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/clients/openai_client.py +16 -1
- helm/clients/palmyra_client.py +1 -2
- helm/clients/together_client.py +22 -0
- helm/common/cache.py +8 -30
- helm/common/key_value_store.py +9 -9
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/test_cache.py +1 -48
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +135 -3
- helm/config/model_metadata.yaml +134 -6
- helm/config/tokenizer_configs.yaml +24 -0
- helm/proxy/server.py +0 -9
- helm/proxy/services/remote_service.py +0 -6
- helm/proxy/services/server_service.py +5 -18
- helm/proxy/services/service.py +0 -6
- helm/benchmark/data_overlap/__init__.py +0 -0
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.4.dist-info}/top_level.txt +0 -0
helm/benchmark/run_specs/bhasa_run_specs.py

@@ -578,14 +578,18 @@ def get_lindsea_syntax_minimal_pairs_spec(language: str = "id", method: str = "m
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=[
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_syntax_minimal_pairs_{language}",
+            f"lindsea_syntax_minimal_pairs_{method}_{language}",
+        ],
     )


-# 2.1. Pragmatics: LINDSEA
-@run_spec_function("
-def
-    name = f"
+# 2.1. Pragmatics: LINDSEA Presuppositions
+@run_spec_function("lindsea_pragmatics_presuppositions")
+def get_lindsea_pragmatics_presuppositions_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_presuppositions_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],

@@ -595,9 +599,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsPresuppositionsScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -606,14 +611,18 @@ def get_lindsea_pragmatics_pragmatic_reasoning_single_spec(language="id") -> Run
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=[
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_pragmatics_presuppositions_{language}",
+            f"lindsea_pragmatics_presuppositions_{subset}_{language}",
+        ],
     )


-# 2.2. Pragmatics: LINDSEA
-@run_spec_function("
-def
-    name = f"
+# 2.2. Pragmatics: LINDSEA Scalar Implicatures
+@run_spec_function("lindsea_pragmatics_scalar_implicatures")
+def get_lindsea_pragmatics_scalar_implicatures_spec(language: str = "id", subset: str = "all") -> RunSpec:
+    name = f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}"

     adapter_spec = get_generation_adapter_spec(
         output_noun=LINDSEA_OUTPUT_NOUNS[language],

@@ -623,9 +632,10 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
     )

     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bhasa_scenario.
+        class_name="helm.benchmark.scenarios.bhasa_scenario.LINDSEAPragmaticsScalarImplicaturesScenario",
         args={
             "language": language,
+            "subset": subset,
         },
     )

@@ -634,5 +644,9 @@ def get_lindsea_pragmatics_pragmatic_reasoning_pair_spec(language="id") -> RunSp
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=[
+        groups=[
+            "bhasa_linguistic",
+            f"lindsea_pragmatics_scalar_implicatures_{language}",
+            f"lindsea_pragmatics_scalar_implicatures_{subset}_{language}",
+        ],
     )
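The run-spec changes above fold the new subset argument into both the run name and the groups list. A minimal standalone sketch of that naming and grouping pattern (plain Python for illustration only; the real functions return a HELM RunSpec):

def presuppositions_run_name_and_groups(language: str = "id", subset: str = "all"):
    # Mirrors the naming/grouping added in get_lindsea_pragmatics_presuppositions_spec.
    name = f"lindsea_pragmatics_presuppositions_{subset}_{language}"
    groups = [
        "bhasa_linguistic",
        f"lindsea_pragmatics_presuppositions_{language}",
        f"lindsea_pragmatics_presuppositions_{subset}_{language}",
    ]
    return name, groups

print(presuppositions_run_name_and_groups("id", "single"))
# ('lindsea_pragmatics_presuppositions_single_id',
#  ['bhasa_linguistic',
#   'lindsea_pragmatics_presuppositions_id',
#   'lindsea_pragmatics_presuppositions_single_id'])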
helm/benchmark/run_specs/finance_run_specs.py

@@ -89,10 +89,14 @@ def get_banking77_spec() -> RunSpec:

     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.banking77_scenario.Banking77Scenario", args={})

-    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77
+    # Use same AdapterSpec and instruction prompts as the RAFT implementation of BANKING77,
+    # with a slight modification to the instruction prompt for instruction-following models.
     scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), Banking77Scenario.name)
+    instructions = get_raft_instructions("banking_77", scenario_cache_path).replace(
+        "\n", " Answer with only the label for the last query.\n", 1
+    )
     adapter_spec = get_generation_adapter_spec(
-        instructions=
+        instructions=instructions,
         input_noun=None,
         output_noun="Label",
         max_tokens=30,  # at most ~50 characters per label
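The new str.replace call above splices an extra sentence onto the end of the first line of the RAFT instructions (only the first newline is replaced). A small sketch with a made-up instruction string; the real text comes from get_raft_instructions("banking_77", ...):

# Hypothetical RAFT-style instruction text, for illustration only.
raft_instructions = (
    "The following is a banking customer service query.\n"
    "Classify the query into one of the 77 intents."
)

# Replace only the first newline, so the added sentence ends the first line
# and the rest of the instructions are left untouched.
instructions = raft_instructions.replace(
    "\n", " Answer with only the label for the last query.\n", 1
)
print(instructions)
# The following is a banking customer service query. Answer with only the label for the last query.
# Classify the query into one of the 77 intents.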
helm/benchmark/run_specs/vlm_run_specs.py

@@ -690,13 +690,18 @@ def get_bingo_spec(subject: str, num_respondents: int) -> RunSpec:
         + _get_open_ended_generation_metric_specs()
     )

-
+    group_name: str = "bingo"
+    if subject == "Region":
+        group_name += "_fairness"
+    elif subject == "OCR":
+        group_name += "_multilinguality"
+
     return RunSpec(
-        name=f"
+        name=f"bingo:subject={subject}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=metric_specs,
-        groups=[
+        groups=[group_name],
     )


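The new group_name logic above routes each BINGO subject into a schema group. A tiny standalone sketch of the same mapping (subject strings other than "Region" and "OCR" are illustrative):

def bingo_group_name(subject: str) -> str:
    # Region -> fairness group, OCR -> multilinguality group, everything else
    # stays in the plain "bingo" group.
    group_name = "bingo"
    if subject == "Region":
        group_name += "_fairness"
    elif subject == "OCR":
        group_name += "_multilinguality"
    return group_name

assert bingo_group_name("Region") == "bingo_fairness"
assert bingo_group_name("OCR") == "bingo_multilinguality"
assert bingo_group_name("SomeOtherSubject") == "bingo"  # hypothetical subject name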
helm/benchmark/scenarios/bhasa_scenario.py

@@ -171,7 +171,7 @@ class XQuADScenario(Scenario):
         super().__init__()
         self.language = language
         self.splits = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
-        self.
+        self.language_to_prompt_components = {
             "th": {
                 "passage_prefix": "ข้อความ: ",
                 "question_prefix": "คำถาม: ",

@@ -183,13 +183,19 @@ class XQuADScenario(Scenario):
                 "random_state": 4502,
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xquad", f"xquad.{self.language}", split="validation")
         df = dataset.to_pandas()

         # Sample 1000 examples for test
-        df_test = df.sample(n=1000, random_state=self.
+        df_test = df.sample(n=1000, random_state=self.prompt_components["random_state"])

         # In-context examples to be drawn from remaining examples (since there is no train data)
         df_train = df[~df.index.isin(df_test.index)]

@@ -210,8 +216,8 @@ class XQuADScenario(Scenario):
             input = PassageQuestionInput(
                 passage=passage,
                 question=question,
-                passage_prefix=str(self.
-                question_prefix=str(self.
+                passage_prefix=str(self.prompt_components["passage_prefix"]),
+                question_prefix=str(self.prompt_components["question_prefix"]),
             )
             references = []
             for answer in row["answers"]["text"]:

@@ -1068,6 +1074,9 @@ class FloresScenario(Scenario):
             "ta": "tam_Taml",
         }

+        if self.source not in self.languages.keys() or self.target not in self.languages.keys():
+            raise Exception(f"Unsupported language/s - supported languages are {self.languages.keys()}")
+
     def get_instances(self, output_path) -> List[Instance]:
         source_dataset = datasets.load_dataset(
             "facebook/flores",

@@ -1259,6 +1268,9 @@ class XNLIScenario(Scenario):
             "test": TEST_SPLIT,
         }
         self.id2label = {0: "A", 2: "B", 1: "C"}
+        self.supported_languages = ["th", "vi"]
+        if self.language not in self.supported_languages:
+            raise Exception(f"{self.language} not supported. Supported languages are {self.supported_languages}.")

     def get_instances(self, output_path) -> List[Instance]:
         dataset = datasets.load_dataset("xnli", self.language)

@@ -1449,7 +1461,7 @@ class XCOPAScenario(Scenario):
             0: "A",
             1: "B",
         }
-        self.
+        self.language_to_prompt_components = {
             "id": {
                 "cause": "sebab",
                 "effect": "akibat",

@@ -1476,6 +1488,12 @@ class XCOPAScenario(Scenario):
                 "instruction2": "Trả lời với một chữ cái duy nhất A hoặc B.",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def get_instances(self, output_path) -> List[Instance]:
         language_dataset = datasets.load_dataset("xcopa", self.language)

@@ -1489,15 +1507,13 @@ class XCOPAScenario(Scenario):
             language_df, tamil_df[["question", "idx"]], on="idx"
         )  # Use the Tamil split's question column
         for _, row in data.iterrows():
-            instruction1 = self.
-                self.prompt[self.language][row["question_y"]]
-            )
+            instruction1 = self.prompt_components["instruction1"].format(self.prompt_components[row["question_y"]])
             passage = "{premise}\n{instruction1}\nA: {choice1}\nB: {choice2}\n{instruction2}".format(
                 premise=row["premise"].strip(),
                 instruction1=instruction1,
                 choice1=row["choice1"].strip(),
                 choice2=row["choice2"].strip(),
-                instruction2=self.
+                instruction2=self.prompt_components["instruction2"],
             )
             input = Input(passage)
             output = Output(self.id2label[int(row["label"])])

@@ -1549,18 +1565,24 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):

     name = "lindsea_minimal_pairs"
     description = "LINDSEA minimal pairs task"
-    tags = ["
+    tags = ["linguistic_diagnostic", "syntax", "minimal_pairs"]

     def __init__(self, method: str, language: str):
         super().__init__()
         self.method = method
         self.language = language
-        self.
+        self.language_to_prompt_components = {
             "id": {
                 "instructions": "Kalimat mana yang lebih mungkin?",
                 "output_prefix": "Jawablah dengan satu huruf saja, A atau B.",
             }
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"

@@ -1586,6 +1608,7 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
         outputs = []
         if self.method == "mcq":
             category_list = data["category"].value_counts().keys()
+
             hlog("MCQ method for LINDSEA Minimal Pairs chosen. Shuffling options...")
             for category in category_list:
                 # Fix shuffling within each category

@@ -1594,10 +1617,8 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
                 options = [(row["correct"], 1), (row["wrong"], 2)]
                 random.shuffle(options)
                 options_reversed = True if options[0][1] == 2 else False
-
-
-                instructions = prompt_components["instructions"]
-                output_prefix = prompt_components["output_prefix"]
+                instructions = self.prompt_components["instructions"]
+                output_prefix = self.prompt_components["output_prefix"]
                 prompt = f"{instructions}\nA: {options[0][0]}\nB: {options[1][0]}\n{output_prefix}"
                 input = Input(text=prompt)
                 # Determine correct option based on whether shuffling reversed the options

@@ -1625,23 +1646,31 @@ class LINDSEASyntaxMinimalPairsScenario(Scenario):
         return outputs


-# 2. Pragmatics
-
-class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
+# 2.1 Pragmatics: LINDSEA Presuppositions
+class LINDSEAPragmaticsPresuppositionsScenario(Scenario):
     """
-    The LINDSEA
+    The LINDSEA Presuppositions dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control.
+    of quality control.

-    The
-
+    The presuppositions dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
+    from another sentence.

-
+    For the single format, the models are prompted using the following general format:

     Is the following statement true or false?
     Statement: <sentence>
     Answer only with True or False.

+    For the pair format, the models are prompted using the following general format:
+
+    Situation: <premise>
+    Given this situation, is the following statement true or false?
+    Statement: <hypothesis>
+    Answer only with True or False.
+
     Target completion:
         <answer>


@@ -1661,50 +1690,101 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         }
     """

-    name = "
-    description = "LINDSEA
-    tags = ["
+    name = "lindsea_pragmatics_presuppositions"
+    description = "LINDSEA presuppositions task"
+    tags = ["linguistic_diagnostic", "pragmatics", "presuppositions"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "
-                "
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-
-
-
-
-
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "presuppositions"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,

@@ -1714,17 +1794,25 @@ class LINDSEAPragmaticsPragmaticReasoningSingleScenario(Scenario):
         return outputs


-# 2.2 Pragmatics: LINDSEA
-class
+# 2.2 Pragmatics: LINDSEA Scalar Implicatures
+class LINDSEAPragmaticsScalarImplicaturesScenario(Scenario):
     """
-    The LINDSEA
+    The LINDSEA Scalar Implicatures Scenario dataset is a linguistic diagnostic scenario targeting pragmatics.
     The data is manually handcrafted by linguists and native speakers and verified through multiple rounds
-    of quality control.
+    of quality control.

-    The
+    The scalar implicatures dataset involves two formats: single and pair sentences.
+    For single sentence questions, the system under test needs to determine if the sentence is true/false.
+    For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn
     from another sentence.

-
+    For the single format, the models are prompted using the following general format:
+
+    Is the following statement true or false?
+    Statement: <sentence>
+    Answer only with True or False.
+
+    For the pair format, the models are prompted using the following general format:

     Situation: <premise>
     Given this situation, is the following statement true or false?

@@ -1750,45 +1838,101 @@ class LINDSEAPragmaticsPragmaticReasoningPairScenario(Scenario):
         }
     """

-    name = "
-    description = "LINDSEA
-    tags = ["
+    name = "lindsea_pragmatics_scalar_implicatures"
+    description = "LINDSEA scalar implicatures task"
+    tags = ["linguistic_diagnostic", "pragmatics", "scalar_implicatures"]

-    def __init__(self, language: str):
+    def __init__(self, language: str, subset: str):
         super().__init__()
         self.language = language
-        self.
+        self.subsets = [subset] if subset != "all" else ["single", "pair"]
+        self.language_to_prompt_components = {
             "id": {
-                "
-                "
-
-
+                "text_noun": "Pernyataan",
+                "premise_noun": "Situasi",
+                "conclusion_noun": "Pernyataan",
+                "single_question": "Apakah pernyataan berikut ini {}?",
+                "single_instruction": "Jawablah dengan {} saja.",
+                "pair_question": "Berdasarkan situasi ini, apakah pernyataan berikut ini benar atau salah?",
+                "pair_instruction": "Jawablah dengan Benar atau Salah saja.",
+                "True": "Benar",
+                "False": "Salah",
             },
         }
+        if self.language not in self.language_to_prompt_components.keys():
+            raise Exception(
+                f"{self.language} not supported. Supported languages are {self.language_to_prompt_components.keys()}."
+            )
+        else:
+            self.prompt_components = self.language_to_prompt_components[self.language]

     def download_dataset(self, output_path: str):
         BASE_URL = "https://raw.githubusercontent.com/aisingapore/BHASA/main/lindsea/"
-
-
-
-
-
+        datasets = []
+        for subset in self.subsets:
+            URL = f"{BASE_URL}{self.language}/pragmatics/pragmatic_reasoning_{subset}.jsonl"
+            file = f"pragmatic_reasoning_{subset}.jsonl"
+            target_path_file = os.path.join(output_path, file)
+            ensure_file_downloaded(source_url=URL, target_path=target_path_file)
+            data = pd.read_json(target_path_file, lines=True)
+            data["subset"] = subset
+            data = data[data["linguistic_phenomenon"] == "scalar_implicatures"]
+            datasets.append(data)
+        dataset = pd.concat(datasets)
         return dataset

     def get_instances(self, output_path) -> List[Instance]:
         data = self.download_dataset(output_path)
         outputs = []
         for _, row in data.iterrows():
-            passage =
-
-
-
-
-
-
-
-
+            passage = None
+            references = []
+
+            if row["subset"] == "single":
+                question = self.prompt_components["single_question"]
+                text_noun = self.prompt_components["text_noun"]
+                instruction = self.prompt_components["single_instruction"]
+
+                passage = "{question}\{text_noun}: {text}\n{instruction}".format(
+                    question=question.format(row["question_translated"]),
+                    text_noun=text_noun,
+                    text=row["text"],
+                    instruction=instruction.format(row["choices_translated"]),
+                )
+                # Split "True or False" into ["True", "or", "False"]
+                choices = row["choices"].split()
+                choices_translated = row["choices_translated"].split()
+                label2choice = {
+                    choices[0]: choices_translated[0],
+                    choices[2]: choices_translated[2],
+                }
+                references.append(
+                    Reference(Output(text=label2choice[row["label"].strip()]), tags=[CORRECT_TAG]),
+                )
+
+            elif row["subset"] == "pair":
+                premise_noun = self.prompt_components["premise_noun"]
+                question = self.prompt_components["pair_question"]
+                conclusion_noun = self.prompt_components["conclusion_noun"]
+                instruction = self.prompt_components["pair_instruction"]
+                label = self.prompt_components[str(row["label"])]
+
+                passage = (
+                    "{premise_noun}: {premise}\n{question}\n{conclusion_noun}: {conclusion}\n{instruction}".format(
+                        premise_noun=premise_noun,
+                        premise=row["text"],
+                        question=question,
+                        conclusion_noun=conclusion_noun,
+                        conclusion=row["conclusion"],
+                        instruction=instruction,
+                    )
+                )
+
+                references.append(
+                    Reference(Output(text=label), tags=[CORRECT_TAG]),
+                )
+
+            input = Input(text=str(passage))
             instance = Instance(
                 input=input,
                 references=references,
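In the single-sentence branch of the two new pragmatics scenarios, the translated gold answer is recovered by position: choices is the literal string "True or False", so after splitting on whitespace the two labels sit at indices 0 and 2. A short sketch of that label2choice mapping with an illustrative row (real rows come from the downloaded pragmatic_reasoning_*.jsonl files):

# Illustrative row; field values mimic the LINDSEA pragmatics data layout.
row = {
    "choices": "True or False",
    "choices_translated": "Benar atau Salah",
    "label": "True",
}

choices = row["choices"].split()  # ["True", "or", "False"]
choices_translated = row["choices_translated"].split()  # ["Benar", "atau", "Salah"]
label2choice = {
    choices[0]: choices_translated[0],  # "True" -> "Benar"
    choices[2]: choices_translated[2],  # "False" -> "Salah"
}
print(label2choice[row["label"].strip()])  # Benar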
helm/benchmark/scenarios/raft_scenario.py

@@ -40,7 +40,7 @@ def get_raft_prompt_settings(subset: str, cache_dir: str):
     return field_ordering[subset], instructions[subset]


-def get_raft_instructions(subset: str, cache_dir: str):
+def get_raft_instructions(subset: str, cache_dir: str) -> str:
     return get_raft_prompt_settings(subset, cache_dir)[1]

